% figure-bpe.tex 4 KB
% Newer Older
% 单韦乔 committed
% 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
% Byte pair encoding (BPE) walkthrough: a word list is split into characters
% with an end marker <e> and frequency counts, then the most frequent adjacent
% pair is merged step by step (es -> est -> est<e>), with side notes, the
% resulting subword vocabulary, and an out-of-vocabulary example.
%
% Uses the TikZ keys `fit' and `drop shadow', the `background' layer and
% \multirow, so the including document must load the fit, shadows and
% backgrounds TikZ libraries and the multirow package.
% NOTE(review): \red and the colour `ugreen' are not defined in this file;
% they are assumed to be provided by the including document -- confirm.
\begin{tikzpicture}
    % Picture-local styles (set inside the environment, so they do not leak).
    \tikzset{
        node/.style={font=\scriptsize},
        sentence/.style={font=\scriptsize,fill=blue!5!white},
        % Solid arrow between consecutive processing steps.
        bpe arrow/.style={->,line width=.03cm},
        % Dotted arrow used by the side notes and the vocabulary call-out.
        bpe callout/.style={->,dotted,very thick},
        % Shared shape of every fitted rectangle (frames and shaded panels).
        bpe box/.style={rectangle,inner sep=0.2em,rounded corners=1pt},
    }

    % --- Main column: corpus state after each step -------------------------
    % node1: raw words; node2: character split with <e> and frequencies;
    % node3..node5: merges es, est, est<e> (merged subword highlighted by \red).
    \node[sentence] (node1) at (0,0) {[`low', `lower', `newest', `widest']};
    \node[sentence,anchor = north] (node2) at ([yshift = -1em]node1.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w e s t $<$e$>$':6, `w i d e s t $<$e$>$':3]};
    \node[sentence,anchor = north] (node3) at ([yshift = -1.5em]node2.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red es} t $<$e$>$':6, `w i d {\red es} t $<$e$>$':3]};
    \node[sentence,anchor = north] (node4) at ([yshift = -1em]node3.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est} $<$e$>$':6, `w i d {\red est} $<$e$>$':3]};
    \node[sentence,anchor = north] (node5) at ([yshift = -1em]node4.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est$<$e$>$}':6, `w i d {\red est$<$e$>$}':3]};
    \node[sentence,anchor = north] (node6) at ([yshift = -1em]node5.south) {$\cdots$};

    % Stopping criterion: the preset vocabulary size is reached, or the next
    % most frequent pair occurs only once.
    \node[node,anchor = north] (node7) at ([yshift = -1.6em]node6.south) {直到达到预设的子词词表大小或下一个最高频的字节对出现频率为1。};

    % Step arrows inside the column; the node2 -> node3 transition is drawn
    % further down as box1 -> box2, once the frames exist.
    \draw[bpe arrow] (node1.south) -- (node2.north);
    \draw[bpe arrow] (node3.south) -- (node4.north);
    \draw[bpe arrow] (node4.south) -- (node5.north);
    \draw[bpe arrow] (node5.south) -- (node6.north);

    % --- Out-of-vocabulary example: `lowest' is segmented as `low est' ------
    \node[node,anchor = west] (node8) at ([xshift = 2em,yshift = 2em]node7.east) {对于词表外的词lowest};
    \node[node,anchor = north west] (node9) at ([yshift = 0.3em]node8.south west) {可以被分割为low est};

    % --- Resulting subword vocabulary ---------------------------------------
    % The `node' style already selects \scriptsize, so no extra font key is needed.
    \node[node,anchor = north,fill=ugreen!5,drop shadow] (dict) at ([xshift = 8em,yshift = -5em]node6.south){\begin{tabular}{llllll}
        \multirow{3}{*}{子词词表:} & `es'  & `est' & `est$<$e$>$' & `lo' & `low'   \\
        & `ne'  & `new'&`newest$<$e$>$' & `low$<$e$>$'& `wi'\\
        & `wid' & `widest$<$e$>$' & `lowe' & `lower'& `lower$<$e$>$'
        \end{tabular}};

    % --- Side notes, one node per text line ---------------------------------
    % Note 1: split into characters, append the end marker <e>, count words.
    \node[node,anchor=west] (line1) at ([xshift = 8em]node1.south east) {按字符拆分,并添加};
    \node[node,anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {终结符$<$e$>$,统计词频。};

    % Note 2: count every adjacent pair and merge the most frequent one.
    \node[node,anchor=north west] (line3) at ([yshift=-4em]line2.south west) {统计每一个连续字节对};
    \node[node,anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {的出现频率,选择最高};
    \node[node,anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {频者合并成新的子词};

    % --- Frames and shaded panels, placed behind the text -------------------
    \begin{pgfonlayer}{background}
        % Dotted frames grouping the preparation phase and the merge phase.
        \node [bpe box,very thick,dotted,draw=purple] [fit = (node1) (node2)] (box1) {};
        \node [bpe box,very thick,dotted,draw=teal] [fit = (node3) (node4) (node5) (node6)] (box2) {};
        % Shaded panels behind the notes, the stopping criterion and the example.
        \node [bpe box,fill=purple!5,drop shadow] [fit = (line1) (line2)] (box3) {};
        \node [bpe box,fill=ugreen!5,drop shadow] [fit = (line3) (line4) (line5)] (box4) {};
        \node [bpe box,fill=purple!5,drop shadow] [fit = (node7)] (box5) {};
        \node [bpe box,fill=blue!5,drop shadow] [fit = (node8) (node9)] (box6) {};
    \end{pgfonlayer}

    % --- Connections that depend on the fitted boxes ------------------------
    % The 0.2em shift equals the inner sep of box5, the panel fitted around node7.
    \draw[bpe arrow] (box2.south) -- ([yshift=0.2em]node7.north);
    \draw[bpe arrow] (box1.south) -- (box2.north);
    % Short dotted arrows leaving the west side of each note panel.
    \draw [bpe callout,purple] (box3.west) -- ([xshift=-1.5em]box3.west);
    \draw [bpe callout,teal] (box4.west) -- ([xshift=-1.7em]box4.west);
    % Curved link from the vocabulary table to the out-of-vocabulary example.
    \draw [bpe callout] ([xshift=6em]dict.north) .. controls +(north:1) and +(south:1) .. (box6.south);
\end{tikzpicture}