// mteval.cpp : Defines the entry point for the console application.

//
// History:
// 
// version 863 -- by hxhou@ict.ac.cn
//    * implement GTM, mWER, mPER score
//    * can process chinese/japanese
//    * support unicode
//    * change some tags: seg->s, sysid->site, DOC->doc
//
//version 11b -- text normalization modified:
//    * take out the join digit line because it joins digits
//      when it shouldn't have
//      $norm_text =~ s/(\d)\s+(?=\d)/$1/g; #join digits
//
// version 11a -- corrected output of individual n-gram precision values
//
// version 11 -- bug fixes:
//    * make filehandle operate in binary mode to prevent Perl from operating
//      (by default in Red Hat 9) in UTF-8
//    * fix failure on joining digits
// version 10 -- updated output to include more details of n-gram scoring.
//    Defaults to generate both NIST and BLEU scores.  Use -b for BLEU
//    only, use -n for NIST only
//
// version 09d -- bug fix (for BLEU scoring, ngrams were fixed at 4
//    being the max, regardless what was entered on the command line.)
//
// version 09c -- bug fix (During the calculation of ngram information,
//    each ngram was being counted only once for each segment.  This has
//    been fixed so that each ngram is counted correctly in each segment.)
//
// version 09b -- text normalization modified:
//    * option flag added to preserve upper case
//    * non-ASCII characters left in place.
//
// version 09a -- text normalization modified:
//    * &quot; and &amp; converted to "" and &, respectively
//    * non-ASCII characters kept together (bug fix)
//
// version 09 -- modified to accommodate sgml tag and attribute
//    names revised to conform to default SGML conventions.
//
// version 08 -- modifies the NIST metric in accordance with the
//    findings on the 2001 Chinese-English dry run corpus.  Also
//    incorporates the BLEU metric as an option and supports the
//    output of ngram detail.
//
// version 07 -- in response to the MT meeting on 28 Jan 2002 at ISI
//    Keep strings of non-ASCII characters together as one word
//    (rather than splitting them into one-character words).
//    Change length penalty so that translations that are longer than
//    the average reference translation are not penalized.
//
// version 06
//    Prevent divide-by-zero when a segment has no evaluation N-grams.
//    Correct segment index for level 3 debug output.
//
// version 05
//    improve diagnostic error messages
//
// version 04
//    tag segments
//
// version 03
//    add detailed output option (intermediate document and segment scores)
//
// version 02
//    accommodation of modified sgml tags and attributes
//
// version 01
//    same as bleu version 15, but modified to provide formal score output.
//
// original IBM version
//    Author: Kishore Papineni
//    Date: 06/10/2001
//////////////////////////////////////////////////////////////////////////////


// Intro

#include <stdlib.h>
#include <math.h>

#pragma warning(disable:4786)
#pragma warning(disable:4503)

#include<iostream>
#include<string>
#include<vector>
#include<map>
#include<fstream>
#include<algorithm>
#include "MTEvalLib.h"

using namespace std;

static PWORDSEGMENTER pwordsegmenter=NULL;

// Per-segment n-gram statistics accumulated for BLEU / NIST scoring.
// Every array is indexed by n-gram order minus one (slot 0 = unigrams).
typedef struct {
	int ref_length;	// length of the shortest reference segment (see score_segment)
	int min_tst_ref_len;	//>< added by xiejun for bleu_sbp 20090617; starts as the test segment length and is lowered by shorter references
	int match_cnt[max_Ngram];	// matched n-gram counts; numerator of the BLEU precision in bleu_score
	int tst_count[max_Ngram];	// number of n-grams of each order in the test segment
	int ref_count[max_Ngram];	// number of n-grams of each order, summed over all references
	double tst_info[max_Ngram];	// information weight of test n-grams, consumed by nist_score
	double ref_info[max_Ngram];	// accumulated ngram_info weight of the reference n-grams
} ngram_struct;

//><added by xiejun for bleu_sbp 20090617
// Debug helper: appends a textual dump of `ns` to the file named `fn`.
// The file is opened in append mode; if it cannot be opened an error is
// written to cerr and the process exits with status 1.
void print_ngram_struct(const string & fn, const ngram_struct& ns)
{
	ofstream f_out(fn.c_str(), ios::app);
	if (!f_out)
	{
		cerr << fn << " open error!" <<endl;
		exit(1);
	}

	f_out << "ref_len min_ref_tst" <<endl;
	f_out << ns.ref_length << " " << ns.min_tst_ref_len <<endl;

	// One output line per counter array, in the fixed order below.
	const char * const labels[3] = { "ref_count: ", "tst_count: ", "match_cnt: " };
	const int * const rows[3] = { ns.ref_count, ns.tst_count, ns.match_cnt };
	for (int row = 0; row < 3; ++row)
	{
		f_out << labels[row];
		for (int order = 0; order < max_Ngram; ++order)
		{
			f_out << rows[row][order] <<" ";
		}
		f_out <<endl;
	}
}

//static int detail=0;
//static int preserve_case=0;

// Read-only map lookup that never inserts.
// Returns a reference to the value stored under `k`, or a reference to a
// shared default-constructed VALUE when the key is absent. The fallback is
// one static object per template instantiation, so callers must not rely on
// its address being distinct between lookups.
template <class KEY,class VALUE>
static const VALUE & atx(const map<KEY,VALUE> & m, const KEY & k)
{
	static VALUE fallback;
	const typename map<KEY,VALUE>::const_iterator pos = m.find(k);
	if (pos != m.end())
		return pos->second;
	return fallback;
}

// Splits `s` on single space characters (' ' only, no other whitespace).
// `v` is cleared first and then receives every non-empty token in order.
// Returns the number of tokens stored in `v`.
static int split(const string & s, vector<string> & v)
{
	v.clear();

	const size_t total = s.length();
	size_t start = 0;
	while (start < total)
	{
		size_t stop = s.find(' ', start);
		if (stop == string::npos)
			stop = total;
		// Consecutive, leading or trailing spaces give empty tokens; skip them.
		if (stop > start)
			v.push_back(s.substr(start, stop - start));
		start = stop + 1;
	}
	return v.size();
}

/*oldict
struct Word
{
	string content;  // ʵ
	int index;            // -1δƥ 1ƥĿʼ
};

static const double B=1.1;   //ʱõĵ
static const double W1=1.7;  //ƥϵƬϼȨصȨ
static const double W2=0.1;  //δƥϵƬϼȨصȨ
*/
/*************************************************
ƣsen2Word
Ҫһʴs1Ƿһs2г      
룺[] s1
[] const string &
[] ںɸʵַ

[] s2
[] const vector<string> &
[] Ŵʵ  
[ֵ] int
[] s1s2гˣ򷵻1򷵻0                
޸ļ¼
*  20031227մ
*************************************************/
/*oldict
static int searchInStr( const string & s1,
				const vector<string> & s2 )
{
	string str;

	for( vector<string>::size_type i = 0; i < s2.size(); i++ )
	{
		str += s2[ i ] + " ";
	}

	int	spp = str.find( s1, 0 );

	while( spp != -1 )
	{
		if( str[ spp + s1.size() ] == ' ' &&
			( spp==0 || str[ spp - 1 ] == ' ' ) )
		{
			return 1;
		}

		spp = str.find( s1, spp + 1 );
	}

	return 0;
}
*/
/*************************************************
ƣdealMatchFragment
ҪԱȲοĺϵͳģҳƥƬϣ
һƥƬ      
룺[] can_Word_Vec
[] vector<Word>&
[] ںɸʵַ

[] refWordVec
[] const vector<string>&
[] Ŵʵ  
             
޸ļ¼
*  20031227մ
*************************************************/
/*oldict
static void dealMatchFragment( vector<Word>& canWordVec,
					   const vector<string>& refWordVec )
{
	// ȰǰûƥϣƥϵĴʱΪ1
	vector<Word>::size_type i;
	for( i = 0; i < canWordVec.size(); i++ )
	{
		if( searchInStr( canWordVec[ i ].content, refWordVec ) &&
			canWordVec[ i ].index == -1 )
		{
			canWordVec[ i ].index = 1;
		}
	}

	// ƥƬϽл
	for( i = 0; i < canWordVec.size(); )
	{
		// index1
		if( canWordVec[ i ].index != 1 )
		{
			i++;
		}
		// index1
		else   
		{
			// λƥƬ
			//  add by cpp
			//int end = i,     // ƥƬϵĩβ
			vector<Word>::size_type end = i,
				final = -1;

			while( end < canWordVec.size() )
			{
				if( end == canWordVec.size() - 1 )
				{
					break;
				}

				if( canWordVec[ end ].index == 1 )
				{
					end++;
				}
				else
				{
					break;
				}
			}

			if( end == canWordVec.size() )
			{
				end--;
			}

			if( canWordVec[ end ].index != 1 )
			{
				end --;
			}

			if( end == canWordVec.size() - 1 )
			{
			}
			else if( canWordVec[ end + 1 ].index == 2 
				&& end > i )
			{
				end--;
			}

			if( end == i )
			{
				final = i;
			}
			else
			{
				final = end;

				bool isFound = false;

				while( final > i && !isFound )
				{
					string temp;

					for( vector<Word>::size_type j = i; j <= final; j++ )
					{
						string space = ( j == final ) ? "" : " ";

						temp += canWordVec[ j ].content + space;
					}

					if( searchInStr( temp, refWordVec ) == 1 )
					{
						for( vector<Word>::size_type k = i + 1; k <= final; k++ )
						{
							canWordVec[ k ].index = canWordVec[ k - 1 ].index + 1;	
						}

						isFound = true;
					}
					else
					{
						final--;
					}
				} 
			}

			i = final + 1;
		}
	}
}
*/
/*************************************************
ƣscore
ҪƥƬϵ÷    
룺[] vec
[] const vector<Word> &
[] ѾֺƥƬϵϵͳ

[] averageRefLen
[] double
[] οĵƽ
[ֵ] double
[] صǸϵͳsegmentĵ÷               
޸ļ¼
*  20031227մ
*************************************************/
/*oldict
static double ict_score( const vector<Word> & vec,
			 double averageRefLen )
{
	int length = vec.size(),   // ӵĳ
		count = 0,             // δƥ䵽ĴʵĿ
		total = 0;             // ƥ䵽ĴʵĿ

	double ratio = 0,   // ƥ䵽Ĵھеı
		score = 0;   // 

	vector< int > intVec;  // 洢ÿƬϵĳ

	// ȼδƥʵĿ
	for( vector<Word>::size_type i = 0; i < vec.size(); i++ )
	{
		if( vec[ i ].index == -1 )
		{
			count++;
		}
	}

	// ƥͷ
	ratio = 1 - (double)count / length;

	// intVec
	for( i = 0; i < vec.size(); )
	{
		// ôΪδƥʣ
		if( vec[ i ].index == -1 )
		{
			i++;
		}

		// ôΪƥ
		if( vec[ i ].index == 1 )
		{
			// ôһ
			if( i == vec.size() - 1 )
			{
				intVec.push_back( 1 );

				i++;
			}
			// ôʲǳȴ1ƥƬϵһ
			else if( vec[ i + 1 ].index != 2  )
			{
				intVec.push_back( 1 );

				i++;	
			}
			// ôһƥƬϵĵһ
			else
			{
				while( i < vec.size() )
				{
					if( i == vec.size() - 1 )
					{
						break;
					}

					if( vec[ i + 1 ].index > 1 )
					{
						i++;
					}
					else
					{
						break;
					}
				}

				intVec.push_back( vec[ i ].index );

				i++;
			}
		}
	}

	double lengthPenalty = 0;    // ȳͷ
	double H = 0;                // 
	const double log2 = log(2.0);

	// 
	for( i = 0; i < intVec.size(); i++ )
	{
		//tempǸֵ
		//double temp = (double)intVec[ i ] / ( length - count );
		double temp = (double)intVec[i]/length;
		H += - W1 * temp * (log(temp) / log2);
	}
	H += - W2 * (length - intVec.size()) * (1.0 / length) * (log(1.0 / length) / log2);

	// 㳤ȳͷ
	lengthPenalty = pow( 2.0, fabs( double( vec.size() ) / averageRefLen - 1 ) );


	// ÷
	score = pow( B, -H * lengthPenalty ) * ratio;
	return score;
}
*/

/*************************************************
ƣsegScore
Ҫϵͳsegmentĵ÷   
룺[] refVec
[] const vector<string> &
[] ϵͳĶӦвο

[] can
[] const string &
[] ϵͳsegment
[ֵ] double
[] صǸϵͳsegmentĵ÷               
޸ļ¼
*  20031227մ
*************************************************/
/*oldict
static double ict_score_it(const string & can, const vector<string> & refVec)
{
vector<string> refWordVec,  // Ųο
canWordVec;  // ϵͳ
vector<Word> can_Word_Vec;  // ϵͳĵWord
double averageRefLen = 0;  // οĵƽ

// ȰϵͳеĴȡ
split( can, canWordVec );

// can_Word_Vec
for( vector<string>::size_type i = 0; i < canWordVec.size(); i++ )
{
Word temp;

temp.content = canWordVec[ i ];
temp.index = -1;

can_Word_Vec.push_back( temp );
}

for( i = 0; i < refVec.size(); i++ )
{
split( refVec[ i ], refWordVec );

averageRefLen += refWordVec.size();

// ÿοģϵͳĶҪ֮Ա
// ƥƬ
dealMatchFragment( can_Word_Vec, refWordVec );
}

if (refVec.size()<=0)
return 0.0;
averageRefLen /= refVec.size();

return ict_score( can_Word_Vec, averageRefLen );  
}
*/

static const double B=1.1;   // base of the exponential that maps the weighted entropy to a score
static const double W1=1;  // entropy weight of a matched fragment
static const double W2=6.7;  // entropy weight of an unmatched word

// *************************************************
// ict_score: turns a list of match fragments into a segment score.
//   vecMatch      - one entry per fragment: a value > 0 is the length of a
//                   matched fragment, 0 marks an unmatched word
//   senlen        - length of the candidate sentence, in words
//   averageRefLen - average length of the reference sentences
// Returns B ^ (-H * lengthPenalty), where H is the weighted base-2 entropy
// of the fragments and lengthPenalty = 2 ^ |senlen/averageRefLen - 1|.
// *************************************************
static double ict_score(const vector<vector<string>::size_type>& vecMatch, vector<string>::size_type senlen, double averageRefLen)
{
	const double log_two = log(2.0);      // converts natural logs to base 2
	double entropy = 0;

	for (vector<string>::size_type idx = 0; idx < vecMatch.size(); ++idx)
	{
		const bool matched = vecMatch[idx] > 0;
		// A matched fragment weighs its share of the sentence; an unmatched
		// word weighs a single word's share.
		const double p = matched ? (double)vecMatch[idx] / senlen : 1.0 / senlen;
		const double weight = matched ? W1 : W2;
		entropy += -1 * weight * p * (log(p)/log_two);
	}

	// Penalise candidates whose length differs from the average reference.
	const double penalty = pow(2.0, fabs(senlen/averageRefLen - 1));
	return pow(B, -entropy * penalty);
}

//////////////////////////////////////////////////////////////////////////
//  Word-level alignment of a candidate sentence against one reference.
//  For every candidate word vecCan[i], finds the first position j in vecRef
//  holding the same word that has not already been claimed by an earlier
//  candidate word, and stores vecMap[i] = j. Words with no free match get
//  size_t(-1).
//
//  vecMap is fully overwritten: on return it has vecCan.length() entries.
//////////////////////////////////////////////////////////////////////////
static void MatchSegment(const SENT & vecCan, const SENT & vecRef, vector<size_t>& vecMap)
{
	vector<bool> vflag(vecRef.length(), false);     // vflag[j]: vecRef[j] already matched
	size_t i;
	size_t cit;

	// assign() rather than resize(): callers reuse the same vecMap for several
	// references, and resize() leaves existing elements untouched, so
	// unmatched words would keep positions found in a previous reference.
	vecMap.assign(vecCan.length(), size_t(-1));
	for (i=0; i<vecCan.size(); i++)
	{
		cit=0;
		while (true)
		{
			cit = vecRef.find(vecCan[i],cit);
			if (cit == SENT::npos)
			{
				break;
			}
			if (!vflag[cit])
			{
				vecMap[i] = cit;
				vflag[cit] = true;
				break;
			}
			// Occurrence already taken: continue the search after it.
			cit++;
		}
	}
}

// Converts a word alignment into a list of match fragments.
//   vecMap   - vecMap[i] is the reference position matched by candidate
//              word i, or size_t(-1) when the word is unmatched
//   vecMatch - output, always rebuilt from scratch. Each maximal run of
//              candidate words mapped to consecutive reference positions
//              becomes one entry holding the run length; each unmatched
//              word becomes one entry holding 0.
// Example: {0,1,2,5,-1,-1,7} -> {3,1,0,0,1}.
static void Map2Match(const vector<size_t>& vecMap, vector<size_t>& vecMatch)
{
	// Clear before the empty check so an empty alignment yields an empty
	// result instead of leaving the caller's previous content in place.
	vecMatch.clear();
	if (vecMap.empty())
		return;

	const size_t unmatched = vector<string>::size_type(-1);
	size_t i;
	size_t seqlen = 0;          // length of the fragment currently being built
	size_t prevval = unmatched; // reference position of the previous word

	for (i=0; i<vecMap.size(); i++)
	{
		if (vecMap[i] == unmatched)
		{
			// Flush the previous fragment (or previous unmatched word).
			if (i > 0)
			{
				vecMatch.push_back(seqlen);
			}
			seqlen = 0;
		}
		else if ((prevval != unmatched) && (vecMap[i] == prevval + 1))
		{
			// Continues the current run of consecutive reference positions.
			seqlen++;
		}
		else
		{
			// Matched, but not adjacent to the previous match: new fragment.
			if (i > 0)
			{
				vecMatch.push_back(seqlen);
			}
			seqlen = 1;
		}
		prevval = vecMap[i];
	}
	// Flush whatever the last word belonged to.
	vecMatch.push_back(seqlen);
}

//////////////////////////////////////////////////////////////////////////
//  Merges the per-reference fragment lists into a single fragment list.
//    vecTempMatch - one fragment list per reference (as built by Map2Match);
//                   modified in place while fragments are consumed
//    vecMatch     - output, rebuilt from scratch
//  At each step the largest fragment among the current positions of all
//  lists is emitted, and the same number of candidate words is consumed
//  from every other list (splitting a longer fragment when necessary).
//  Merging stops as soon as any list is exhausted. With no lists at all the
//  result is empty.
//////////////////////////////////////////////////////////////////////////
static void MergeMatch(vector<vector<size_t> >& vecTempMatch, vector<size_t>& vecMatch)
{
	size_t i;
	size_t maxindex, maxvalue;
	vector<size_t> vecPos;      // current read position inside each list

	vecMatch.clear();
	// Without any reference there is nothing to merge; the loop below would
	// otherwise index vecTempMatch[0] on an empty vector.
	if (vecTempMatch.empty())
		return;

	vecPos.resize(vecTempMatch.size(), 0);
	while (true)
	{
		maxindex = 0;
		//  Stop once a list is exhausted; otherwise find the largest fragment.
		for (i=0; i<vecPos.size(); i++)
		{
			if (vecPos[i] >= vecTempMatch[i].size())
			{
				break;
			}
			if (vecTempMatch[i][vecPos[i]] > vecTempMatch[maxindex][vecPos[maxindex]])
			{
				maxindex = i;
			}
		}
		if (i != vecPos.size())
		{
			break;
		}
		//  Emit the winning fragment.
		maxvalue = vecTempMatch[maxindex][vecPos[maxindex]];
		vecMatch.push_back(maxvalue);
		//  Advance every list by the number of words just covered.
		if (maxvalue == 0)
		{
			// An unmatched word covers exactly one word in every list.
			for (i=0; i<vecPos.size(); i++)
			{
				vecPos[i]++;
			}
		}
		else
		{
			for (i=0; i<vecPos.size(); i++)
			{
				maxvalue = vecTempMatch[maxindex][vecPos[maxindex]];
				if (i != maxindex)
				{
					// The bound check protects against lists that describe
					// fewer words than the winning fragment covers.
					while(maxvalue > 0 && vecPos[i] < vecTempMatch[i].size())
					{
						if (vecTempMatch[i][vecPos[i]] == 0)
						{
							vecPos[i]++;
							maxvalue--;
						}
						else if (vecTempMatch[i][vecPos[i]] > maxvalue)
						{
							// Fragment longer than what is left: keep the rest.
							vecTempMatch[i][vecPos[i]] -= maxvalue;
							maxvalue = 0;
						}
						else
						{
							maxvalue -= vecTempMatch[i][vecPos[i]];
							vecPos[i]++;
						}
					}
				}
			}
			vecPos[maxindex]++;
		}
	}
}

//////////////////////////////////////////////////////////////////////////
//Ҫϵͳsegmentĵ÷   
//룺
//  [] can
//  [] ϵͳsegment
//  [] ref
//  [] ϵͳĶӦвο
//أ
//  ϵͳƬϵĵ÷
//㷨̣
//  1.  canֳɴvecCan
//  2.  ÿһref[i]ִ3-5
//  3.      ref[i]ֳɴvecRef
//  4.      ҵvecCanvecRef֮ƥϵvecMap
//  5.      תvecMapvecTempMatch[i]
//  6.  ôƥ䷽ϲvecTempMatch[i]vecMatch
//  7.  vecMatchscore÷
//㷨˵
//  vecCan: canֳɴʵĽ
//  vecRef: ref[i]ֳɴʵĽ
//  vecMap: vecRefĵkvecCanĵjͬvecMap[j]=k
//  vecTempMatch: ʾƥ
//  vecMatch: ʾƥ{3,1,0,0,2}ʾcanλ0-2,3,6ƥƬϣλ45δƥϵƬ
//////////////////////////////////////////////////////////////////////////
static vector<double> ict_score_it(const SENT & can, const vector<SENT> & ref)
{
	size_t i;
	const SENT & vecCan=can;                      //Ĵʵ
	vector<vector<size_t> > vecTempMatch; //ʾƥϲǰ
	vector<size_t> vecMatch; //ʾƥϲ
	vector<size_t> vecMap;   //ʾӦʵƥ
	double averageRefLen;   // οĵƽ
	//  ʼ
	vecTempMatch.resize(ref.size());
	averageRefLen = 0;
	// canvecCan
	//...done
	//    cout<<"=========vecCan========="<<endl;
	//    print1(vecCan);
	// ѭ
	for(i=0; i<ref.size(); i++)
	{
		const SENT &vecRef=ref[i];                      //ŲοĴʵ
		//  ref[i]vecRef
		// ...done
		//  ҵʶԴʵƥ
		//        cout<<"=========vecRef========="<<endl;
		//        print1(vecRef);
		MatchSegment(vecCan, vecRef, vecMap);
		//        cout<<"=========vecMap========="<<endl;
		//        print1(vecMap);
		//  תƥ
		Map2Match(vecMap, vecTempMatch[i]);
		//        cout<<"=========vecTempMatch["<<i<<"]========="<<endl;
		//        print1(vecTempMatch[i]);
		//  ƽ
		averageRefLen += vecRef.size();
	}
	averageRefLen /= ref.size();
	//  ϲvecTempMatchvecMatch
	MergeMatch(vecTempMatch, vecMatch);
	//    cout<<"=========vecMatch========="<<endl;
	//    print1(vecMatch);

	vector<double> res;
	res.resize(3);
	res[0]=ict_score(vecMatch, vecCan.size(), averageRefLen);
	res[1]=vecCan.size();
	res[2]=1;
	return res;
}

static vector<double> ict_score_seg(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, int segid, SEGSCORE & score)
{
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;

	const SENT & seg=atx(atx(atx(tsts,site),docid),segid);

	vector<SENT> ref_segments;

	SITEMAP::const_iterator it2;
	for(it2=refs.begin();it2!=refs.end();it2++)
	{
		const DOCMAP & docs=it2->second;
		ref_segments.push_back(atx(atx(docs,docid),segid));
	}

	res=ict_score_it(seg,ref_segments);

	double sc;
	sc=(res[2]==0?0:res[0]/res[2]);

	score[segid].first=sc;
	score[segid].second=int(res[1]);
	return res;
}

static vector<double> ict_score_doc(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, DOCSCORE & score)
{
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;

	const SEGMAP & segs=atx(atx(tsts,site),docid);

	SEGMAP::const_iterator it1;
	int n;
	n=0;
	for(it1=segs.begin();it1!=segs.end();it1++)
	{
		int segid=it1->first;
		const SENT & seg=it1->second;
		vector<double> r;
		r=ict_score_seg(refs,tsts,site,docid,segid,score[docid].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
		n++;
	}
	double sc;
	sc=(res[2]==0?0:res[0]/res[2]);

	score[docid].first.first=sc;
	score[docid].first.second=int(res[1]);
	return res;
}

double ict_score_system(const SITEMAP & refs, const SITEMAP & tsts, const string & site, SITESCORE & score)
{
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;

	const DOCMAP & docs=atx(tsts,site);

	DOCMAP::const_iterator it1;
	for(it1=docs.begin();it1!=docs.end();it1++)
	{
		string docid=it1->first;
		const SEGMAP & segs=it1->second;
		vector<double> r;
		r=ict_score_doc(refs,tsts,site,docid,score[site].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double sc;
	sc=(res[2]==0?0:res[0]/res[2]);

	score[site].first.first=sc;
	score[site].first.second=int(res[1]);

	return sc;
}

// Returns 1 when the (reference index, test index) pair `h` is present in
// `_hits`, 0 otherwise. Linear search.
static int gtm_contain(pair<int,int> h, const vector<pair<int,int> > & _hits)
{
	return find(_hits.begin(), _hits.end(), h) != _hits.end();
}

// GTM-style match size between a test segment and its references.
//   reflist - all reference segments concatenated, each followed by a -1
//             separator token (see gtm_score_it)
//   tstlist - the test segment
// Returns a 3-element vector:
//   [0] number of test/reference word positions covered by the selected runs
//   [1] test length
//   [2] mean reference length (words per separator)
// The number of covered positions is capped by min(test length, mean
// reference length). When reflist holds no separator (no references) the
// mean reference length is taken as 0, so the result is {0, test length, 0}
// rather than a NaN that would poison the aggregated scores.
static vector<double> gtm_score(const SENT & reflist, const SENT & tstlist)
{
	vector<double> res;
	int _X,_Y;
	double maxhits;
	int i,j,k;
	double xmean;
	vector<pair<int,int> > _hits;
	vector<pair<pair<int,int>,int> > _runs;
	vector<int> xused,yused;
	_X=reflist.length();
	_Y=tstlist.length();
	maxhits=_Y;
	// j counts reference words, k counts separators (= number of references).
	j=k=0;
	for(i=0;i<(int)reflist.length();i++)
	{
		if (reflist[i]==-1)
			k++;
		else
			j++;
	}
	// Guard the division: no separator means there is no reference at all.
	xmean=(k>0?(double)j/k:0.0);
	if (xmean<maxhits)
		maxhits=xmean;
	// Collect every (reference position, test position) pair of equal tokens.
	_hits.resize(_X*_Y);
	k=0;
	for(i=0;i<(int)reflist.length();i++)
	{
		int reftoken=reflist[i];
		for(j=0;j<(int)tstlist.length();j++)
		{
			if (reftoken==tstlist[j])
			{
				_hits[k++]=pair<int,int>(i,j);
			}
		}
	}
	_hits.resize(k);
	// For each hit, follow the diagonal as long as the next pair is also a
	// hit, then record one candidate run for every prefix length 1..l.
	for(i=0;i<(int)_hits.size();i++)
	{
		pair<int,int> h2;
		int l;
		h2=_hits[i];
		while (h2.first<_X-1 && h2.second<_Y-1)
		{
			h2.first++;
			h2.second++;
			if (!gtm_contain(h2,_hits))
			{
				h2.first--;
				h2.second--;
				break;
			}
		}
		// Both coordinates advanced by the same amount d, so the expression
		// below evaluates to d+1, the length of the diagonal run.
		l=(h2.second*_X+h2.first-_hits[i].second*_X-_hits[i].first)/(_X+1)+1;
		for(j=1;j<=l;j++)
		{
			//			k=gtm_findpos(_hits[i],j,_runs);
			_runs.push_back(pair<pair<int,int>,int>(_hits[i],j));
		}
	}
	xused.resize(_X);
	for(i=0;i<_X;i++)
		xused[i]=0;
	yused.resize(_Y);
	for(i=0;i<_Y;i++)
		yused[i]=0;

	// Greedily accept runs, in the order recorded, that do not overlap
	// positions already used, until the hit budget is spent.
	double sc=0;
	for(i=0;i<(int)_runs.size();i++)
	{
		pair<int,int> r;
		double l;
		int con;
		int x,y;
		r=_runs[i].first;
		l=_runs[i].second;
		if (maxhits<=0)
			break;
		if (l>maxhits)
			l=maxhits;
		x=r.first;
		y=r.second;
		con=0;
		for(j=0;j<l && !con;j++)
		{
			if (xused[x+j] || yused[y+j])
				con=1;
		}
		if (con)
			continue;
		for(j=0;j<l;j++)
		{
			xused[x+j]=1;
			yused[y+j]=1;
		}
		maxhits-=l;
		sc+=l;
	}
	res.resize(3);
	res[0]=sc;
	res[1]=_Y;
	res[2]=xmean;

	return res;
}

static vector<double> gtm_score_it(const SENT & seg, const vector<SENT> & ref_segments)
{
	SENT reflist;

	int i;
	reflist.clear();
	for(i=0;i<(int)ref_segments.size();i++)
	{
		const SENT & s=ref_segments[i];
		//		_tprintf(_T("%s\n"),u2t(s).c_str());
		reflist+=s;
		reflist.push_back(-1);
	}
	//	_tprintf(_T("%d\n"),reflist.size());

	return gtm_score(reflist,seg);
}

static vector<double> gtm_score_seg(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, int segid, SEGSCORE & score)
{
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;

	const SENT & seg=atx(atx(atx(tsts,site),docid),segid);

	vector<SENT> ref_segments;

	SITEMAP::const_iterator it2;
	for(it2=refs.begin();it2!=refs.end();it2++)
	{
		const DOCMAP & docs=it2->second;
		ref_segments.push_back(atx(atx(docs,docid),segid));
	}

	res=gtm_score_it(seg,ref_segments);

	double prc,rcl,sc;
	prc=(res[1]==0?0:res[0]/res[1]);
	rcl=(res[2]==0?0:res[0]/res[2]);
	sc=(prc+rcl==0?0:2*prc*rcl/(prc+rcl));

	score[segid].first=sc;
	score[segid].second=int(res[1]);

	return res;
}

static vector<double> gtm_score_doc(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, DOCSCORE & score)
{
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;

	const SEGMAP & segs=atx(atx(tsts,site),docid);

	SEGMAP::const_iterator it1;
	for(it1=segs.begin();it1!=segs.end();it1++)
	{
		int segid=it1->first;
		const SENT & seg=it1->second;
		vector<double> r;
		r=gtm_score_seg(refs,tsts,site,docid,segid,score[docid].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double prc,rcl,sc;
	prc=(res[1]==0?0:res[0]/res[1]);
	rcl=(res[2]==0?0:res[0]/res[2]);
	sc=(prc+rcl==0?0:2*prc*rcl/(prc+rcl));

	score[docid].first.first=sc;
	score[docid].first.second=int(res[1]);

	return res;
}

double gtm_score_system(const SITEMAP & refs, const SITEMAP & tsts, const string & site, SITESCORE & score)
{
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;

	const DOCMAP & docs=atx(tsts,site);

	DOCMAP::const_iterator it1;
	for(it1=docs.begin();it1!=docs.end();it1++)
	{
		string docid=it1->first;
		const SEGMAP & segs=it1->second;
		vector<double> r;
		r=gtm_score_doc(refs,tsts,site,docid,score[site].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double prc,rcl,sc;
	prc=(res[1]==0?0:res[0]/res[1]);
	rcl=(res[2]==0?0:res[0]/res[2]);
	sc=(prc+rcl==0?0:2*prc*rcl/(prc+rcl));

	score[site].first.first=sc;
	score[site].first.second=int(res[1]);

	return sc;
}

// Word-level Levenshtein distance between a test segment and one reference.
// Substitution, insertion and deletion all cost 1.
// Returns {edit distance, test length, reference length}.
static vector<double> mwer_Levenshtein(const SENT & seg, const SENT & ref)
{
	const int _Y = seg.length();
	const int _X = ref.length();
	const int stride = _Y + 1;

	// cost[row*stride + col]: distance between the first `row` reference
	// words and the first `col` test words.
	vector<int> cost((_X + 1) * stride);

	// Row 0: only insertions.
	for (int col = 0; col <= _Y; ++col)
		cost[col] = col;

	for (int row = 1; row <= _X; ++row)
	{
		// Column 0: only deletions.
		cost[row * stride] = cost[(row - 1) * stride] + 1;
		for (int col = 1; col <= _Y; ++col)
		{
			const int sub = cost[(row - 1) * stride + (col - 1)] + (seg[col - 1] == ref[row - 1] ? 0 : 1);
			const int del = cost[(row - 1) * stride + col] + 1;
			const int ins = cost[row * stride + (col - 1)] + 1;
			cost[row * stride + col] = min(sub, min(del, ins));
		}
	}

	vector<double> res(3);
	res[0] = cost[_X * stride + _Y];
	res[1] = _Y;
	res[2] = _X;
	return res;
}

static vector<double> mwer_score_seg(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, int segid, SEGSCORE & score)
{
	const SENT & seg=atx(atx(atx(tsts,site),docid),segid);
	vector<double> res;
	vector<SENT> ref_segments;
	res.resize(3);
	res[0]=res[1]=res[2]=-1;

	SITEMAP::const_iterator it2;
	for(it2=refs.begin();it2!=refs.end();it2++)
	{
		vector<double> r;
		const DOCMAP & docs=it2->second;
		r=mwer_Levenshtein(seg,atx(atx(docs,docid),segid));
		if (res[0]>r[0] || res[0]<0)
			res=r;
	}
	if (res[0]<0)
		res[0]=res[1]=res[2]=0;

	double sc;

	sc=res[1]==0&&res[2]==0?0:res[0]/(res[1]>res[2]?res[1]:res[2]);

	score[segid].first=sc;
	score[segid].second=int(res[1]);

	return res;
}

static vector<double> mwer_score_doc(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, DOCSCORE & score)
{
	const SEGMAP & segs=atx(atx(tsts,site),docid);
	SEGMAP::const_iterator it1;
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;
	for(it1=segs.begin();it1!=segs.end();it1++)
	{
		int segid=it1->first;
		const SENT & seg=it1->second;
		vector<double> r;
		r=mwer_score_seg(refs,tsts,site,docid,segid,score[docid].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double sc;

	sc=res[1]==0&&res[2]==0?0:res[0]/(res[1]>res[2]?res[1]:res[2]);
	score[docid].first.first=sc;
	score[docid].first.second=int(res[1]);

	return res;
}

double mwer_score_system(const SITEMAP & refs, const SITEMAP & tsts, const string & site, SITESCORE & score)
{
	const DOCMAP & docs=atx(tsts,site);

	DOCMAP::const_iterator it1;
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;
	for(it1=docs.begin();it1!=docs.end();it1++)
	{
		string docid=it1->first;
		const SEGMAP & segs=it1->second;
		vector<double> r;
		r=mwer_score_doc(refs,tsts,site,docid,score[site].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double sc;

	sc=res[1]==0&&res[2]==0?0:res[0]/(res[1]>res[2]?res[1]:res[2]);

	score[site].first.first=sc;
	score[site].first.second=int(res[1]);

	return sc;
}

// Position-independent error count between a test segment and one
// reference. Each test word is paired with the first not-yet-used equal
// reference word; the error count is the larger of the number of unpaired
// test words and the number of unpaired reference words.
// Returns {errors, test length, reference length}.
static vector<double> mper_score_it(const SENT & seg, const SENT & ref)
{
	const int _Y = seg.length();
	const int _X = ref.length();

	vector<int> refTaken(_X, 0);    // refTaken[j] != 0: ref[j] already paired
	int paired = 0;
	for (int t = 0; t < _Y; ++t)
	{
		for (int r = 0; r < _X; ++r)
		{
			if (refTaken[r] == 0 && seg[t] == ref[r])
			{
				refTaken[r] = 1;
				++paired;
				break;
			}
		}
	}

	// Every pairing removes exactly one word from each side.
	const int unpairedTst = _Y - paired;
	const int unpairedRef = _X - paired;

	vector<double> res(3);
	res[0] = (unpairedRef > unpairedTst ? unpairedRef : unpairedTst);
	res[1] = _Y;
	res[2] = _X;
	return res;
}

static vector<double> mper_score_seg(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, int segid, SEGSCORE & score)
{
	const SENT & seg=atx(atx(atx(tsts,site),docid),segid);
	vector<double> res;
	vector<string> ref_segments;
	res.resize(3);
	res[0]=res[1]=res[2]=-1;

	SITEMAP::const_iterator it2;
	for(it2=refs.begin();it2!=refs.end();it2++)
	{
		vector<double> r;
		const DOCMAP & docs=it2->second;
		r=mper_score_it(seg,atx(atx(docs,docid),segid));
		if (res[0]>r[0] || res[0]<0)
			res=r;
	}
	if (res[0]<0)
		res[0]=res[1]=res[2]=0;

	double sc;

	sc=res[1]==0&&res[2]==0?0:res[0]/(res[1]>res[2]?res[1]:res[2]);

	score[segid].first=sc;
	score[segid].second=int(res[1]);

	return res;
}

static vector<double> mper_score_doc(const SITEMAP & refs, const SITEMAP & tsts, const string & site, const string & docid, DOCSCORE & score)
{
	const SEGMAP & segs=atx(atx(tsts,site),docid);

	SEGMAP::const_iterator it1;
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;
	for(it1=segs.begin();it1!=segs.end();it1++)
	{
		int segid=it1->first;
		const SENT & seg=it1->second;
		vector<double> r;
		r=mper_score_seg(refs,tsts,site,docid,segid,score[docid].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double sc;

	sc=res[1]==0&&res[2]==0?0:res[0]/(res[1]>res[2]?res[1]:res[2]);
	score[docid].first.first=sc;
	score[docid].first.second=int(res[1]);

	return res;
}

double mper_score_system(const SITEMAP & refs, const SITEMAP & tsts, const string & site, SITESCORE & score)
{
	const DOCMAP & docs=atx(tsts,site);

	DOCMAP::const_iterator it1;
	vector<double> res;
	res.resize(3);
	res[0]=res[1]=res[2]=0;
	for(it1=docs.begin();it1!=docs.end();it1++)
	{
		string docid=it1->first;
		const SEGMAP & segs=it1->second;
		vector<double> r;
		r=mper_score_doc(refs,tsts,site,docid,score[site].second);
		res[0]+=r[0];
		res[1]+=r[1];
		res[2]+=r[2];
	}
	double sc;

	sc=res[1]==0&&res[2]==0?0:res[0]/(res[1]>res[2]?res[1]:res[2]);

	score[site].first.first=sc;
	score[site].first.second=int(res[1]);

	return sc;
}

// Counts every n-gram of order 1..max_Ngram occurring in `words`.
// `count` is cleared first; afterwards count[g] is the number of positions
// at which n-gram g starts.
static void Words2Ngrams(const SENT & words,map<SENT,int> & count)
{
	count.clear();

	const int total = (int)words.size();
	for (int start = 0; start < total; ++start)
	{
		// Grow the n-gram one word at a time so each prefix is counted too.
		SENT ngram;
		for (int len = 0; len < max_Ngram && start + len < total; ++len)
		{
			ngram.push_back(words[start + len]);
			// operator[] value-initialises a missing count to 0.
			++count[ngram];
		}
	}
}

// NIST brevity penalty as a function of ratio = test length / reference
// length. Returns 1 for ratio >= 1, 0 for ratio <= 0, and otherwise
// exp(-beta * ln(ratio)^2), with beta chosen so that the penalty equals
// 0.5 when the length ratio is 1/1.5.
static double nist_length_penalty(double ratio)
{
	if (ratio >= 1)
		return 1;
	if (ratio <= 0)
		return 0;

	const double anchor_ratio = 1.5;
	const double anchor_score = 0.5;
	const double beta = -log(anchor_score)/log(anchor_ratio)/log(anchor_ratio);
	const double log_ratio = log(ratio);
	return exp(-beta*log_ratio*log_ratio);
}

// Fills SCOREmt[site] with NIST scores for every n-gram order and returns
// the cumulative score at order NIST_ORDER.
// For order n the per-order information is tst_info[n] / max(tst_count[n],1).
// `.ind` holds that value and `.cum` the running sum up to n, both
// multiplied by the length penalty for
// tst_count[0] / (ref_count[0] / number of reference sites).
static double nist_score(const SITEMAP & refs, const string site, const ngram_struct & res, NSCOREMAP & SCOREmt)
{
	// The penalty depends only on unigram counts, so it is the same for
	// every order.
	const double penalty = nist_length_penalty((double)res.tst_count[0]/((double)res.ref_count[0]/refs.size()));

	double running = 0;
	SCOREmt[site].resize(max_Ngram);
	for (int order = 0; order < max_Ngram; ++order)
	{
		const double info = (double)res.tst_info[order]/max(res.tst_count[order],1);
		running += info;
		SCOREmt[site][order].cum = running * penalty;
		SCOREmt[site][order].ind = info * penalty;
	}
	return SCOREmt[site][NIST_ORDER-1].cum;
}

// Fills SCOREmt[site] with BLEU scores for every n-gram order and returns
// the cumulative score at order BLEU_ORDER.
//   res   - accumulated n-gram statistics
//   i_sbp - selects the brevity penalty: -1 compares ref_length with
//           min_tst_ref_len (bleu_sbp), 0 compares it with tst_count[0]
//           (standard BLEU); any other value applies no penalty
// For order j, `.ind` is the precision match_cnt[j]/tst_count[j] and `.cum`
// is exp(mean log precision up to j + brevity term). Orders with no match
// get cum = ind = 0.
static double bleu_score(const string site, const ngram_struct & res, NSCOREMAP & SCOREmt, int i_sbp)
{
	//print_ngram_struct("ngram info",res);

	double score = 0;
	double iscore = 0;
	//>< modified by xiejun for bleu_sbp 20090617		--->beg

	//double len_score = min (0.0, 1-(double)res.ref_length/res.tst_count[0]);

	// Brevity term: 0 unless the reference is longer than the chosen length.
	double len_score = 0;
	if ( i_sbp == -1 && (double)res.ref_length/res.min_tst_ref_len > 1 )	//>< i_sbp = -1 for bleu_sbp
		len_score = 1 - (double)res.ref_length/res.min_tst_ref_len; 

	else if(i_sbp == 0 && (double)res.ref_length/res.tst_count[0] >1)	//>< i_sbp = 0 for bleu 
		len_score = 1 - (double)res.ref_length/res.tst_count[0];

	//>< modified by xiejun for bleu_sbp 20090617		--->end

	int j;

	SCOREmt[site].resize(max_Ngram);
	for (j=0; j<max_Ngram; j++)
	{
		if (res.match_cnt[j] == 0)
		{
			SCOREmt[site][j].cum=0;
			// Also reset the individual score: resize() keeps existing
			// elements, so a row reused from an earlier call would otherwise
			// report a stale precision for this order.
			SCOREmt[site][j].ind=0;
		}
		else
		{
			// Cumulative N-Gram score
			score += log ((double)res.match_cnt[j]/res.tst_count[j]);
			SCOREmt[site][j].cum = exp(score/(j+1) + len_score);
			// Individual N-Gram score
			iscore = log ((double)res.match_cnt[j]/res.tst_count[j]);
			SCOREmt[site][j].ind = exp(iscore);
		}
	}
	return SCOREmt[site][BLEU_ORDER-1].cum;
}

// Score one test segment against the matching segment of every reference set.
//
// refs/tsts  - reference and test data, looked up by site, docid and segid
//              (via the atx lookup helper defined elsewhere in this file).
// ngram_info - information weight of each reference n-gram.
// nist       - 1 selects nist_score, 0 or -1 selects bleu_score (the value is
//              forwarded as its i_sbp argument); other values store no score.
// score      - output: score[segid].first receives the segment score and
//              score[segid].second the number of test unigrams.
//
// Returns the raw n-gram statistics of the segment so that callers can
// accumulate them.
static ngram_struct score_segment(const SITEMAP & refs, const SITEMAP & tsts,
								  const GRAMMAP & ngram_info, 
								  const string & site, const string & docid, int segid,
								  int nist, SEGSCORE & score)
{
	const SENT & tst_wrds=atx(atx(atx(tsts,site),docid),segid);

	// start from empty statistics; the test n-gram totals follow from the length
	ngram_struct res;
	int n;
	for(n=0;n<max_Ngram;n++)
	{
		res.match_cnt[n]=0;
		res.ref_count[n]=0;
		res.tst_info[n]=0;
		res.ref_info[n]=0;
		res.tst_count[n]=n<(int)tst_wrds.length()?tst_wrds.length()-n:0;
	}

	// n-gram occurrence counts of the test segment
	map<SENT,int> tst_ngrams;
	Words2Ngrams(tst_wrds,tst_ngrams);

	// for every n-gram, the largest count seen in any single reference
	map<SENT,int> ref_ngrams_max;

	// -1 marks "no reference seen yet"; afterwards the shortest reference length
	res.ref_length=-1;

	// bleu_sbp (xiejun, 20090617): minimum of the test length and the
	// shortest reference length
	res.min_tst_ref_len = tst_wrds.length();

	SITEMAP::const_iterator ref_it;
	for(ref_it=refs.begin();ref_it!=refs.end();++ref_it)
	{
		const SENT & ref_wrds=atx(atx(ref_it->second,docid),segid);

		map<SENT,int> ref_ngrams;
		Words2Ngrams(ref_wrds,ref_ngrams);

		map<SENT,int>::const_iterator g;
		for(g=ref_ngrams.begin();g!=ref_ngrams.end();++g)
		{
			const SENT & ngram=g->first;
			const int occurrences=g->second;

			res.ref_info[ngram.length()-1]+=atx(ngram_info,ngram);

			// keep the maximum number of occurrences over all references
			map<SENT,int>::iterator best=ref_ngrams_max.find(ngram);
			if (best==ref_ngrams_max.end())
				ref_ngrams_max[ngram]=occurrences;
			else if (best->second<occurrences)
				best->second=occurrences;
		}

		for(n=0;n<max_Ngram;n++)
			res.ref_count[n]+=n<(int)ref_wrds.length()?ref_wrds.length()-n:0;

		if (res.ref_length==-1 || (int)ref_wrds.length()<res.ref_length)
			res.ref_length=ref_wrds.length();

		if (ref_wrds.size() < res.min_tst_ref_len)
			res.min_tst_ref_len = ref_wrds.size();
	}

	// credit every test n-gram that also occurs in a reference, clipped to the
	// maximum reference count
	map<SENT,int>::const_iterator t;
	for(t=tst_ngrams.begin();t!=tst_ngrams.end();++t)
	{
		const SENT & ngram=t->first;
		map<SENT,int>::const_iterator cap=ref_ngrams_max.find(ngram);
		if (cap==ref_ngrams_max.end())
			continue;

		const int clipped=min(t->second,cap->second);
		res.tst_info[ngram.length()-1] += atx(ngram_info,ngram) * clipped;
		res.match_cnt[ngram.length()-1] += clipped;
	}

	NSCOREMAP SEGmt;
	switch (nist)
	{
	case 1:
		score[segid].first=nist_score(refs,site,res,SEGmt);
		break;
	case 0:
	case -1:
		score[segid].first=bleu_score(site,res,SEGmt, nist);
		break;
	default:
		break;
	}

	score[segid].second=res.tst_count[0];

	return res;
}

// Score every segment of one test document and the document as a whole.
//
// nist  - 1 selects nist_score, 0 or -1 selects bleu_score; other values
//         store no document score.
// score - output: score[docid].second receives the per-segment scores,
//         score[docid].first.first the document score and
//         score[docid].first.second the number of test unigrams.
//
// Returns the n-gram statistics summed over all segments of the document.
static ngram_struct score_document(const SITEMAP & refs, const SITEMAP & tsts,
								   const GRAMMAP & ngram_info,
								   const string & site, const string & docid,
								   int nist, DOCSCORE & score)
{
	const SEGMAP & segs=atx(atx(tsts,site),docid);

	// document totals start out empty
	ngram_struct total;
	int n;
	total.ref_length=0;
	total.min_tst_ref_len = 0;
	for(n=0;n<max_Ngram;n++)
	{
		total.match_cnt[n]=0;
		total.ref_count[n]=0;
		total.tst_count[n]=0;
		total.tst_info[n]=0;
		total.ref_info[n]=0;
	}

	SEGMAP::const_iterator seg_it;
	for(seg_it=segs.begin();seg_it!=segs.end();++seg_it)
	{
		const ngram_struct part=score_segment(refs,tsts,ngram_info,site,docid,
											  seg_it->first,nist,score[docid].second);

		total.ref_length+=part.ref_length;
		total.min_tst_ref_len += part.min_tst_ref_len;	// bleu_sbp (xiejun)
		for(n=0;n<max_Ngram;n++)
		{
			total.match_cnt[n]+=part.match_cnt[n];
			total.tst_count[n]+=part.tst_count[n];
			total.ref_count[n]+=part.ref_count[n];
			total.tst_info[n]+=part.tst_info[n];
			total.ref_info[n]+=part.ref_info[n];
		}
	}

	NSCOREMAP DOCmt;
	switch (nist)
	{
	case 1:
		score[docid].first.first=nist_score(refs,site,total,DOCmt);
		break;
	case -1:
	case 0:
		score[docid].first.first=bleu_score(site,total,DOCmt, nist);
		break;
	default:
		break;
	}
	score[docid].first.second=total.tst_count[0];

	return total;
}

void score_system(const SITEMAP & refs, const SITEMAP & tsts,
				  const GRAMMAP & ngram_info,
				  const string & site,
				  NSCOREMAP & SCOREmt,
				  int nist, SITESCORE & score)
{
	const DOCMAP & docs=atx(tsts,site);

	DOCMAP::const_iterator it1;
	ngram_struct res;
	int j;
	res.ref_length=0;
	res.min_tst_ref_len = 0;
	for(j=0;j<max_Ngram;j++)
	{
		res.match_cnt[j]=res.ref_count[j]=res.tst_count[j]=0;
		res.tst_info[j]=res.ref_info[j]=0;
	}
	for(it1=docs.begin();it1!=docs.end();it1++)
	{
		const string & docid=it1->first;
		const SEGMAP & segs=it1->second;
		ngram_struct r;
//		NSCOREMAP DOCmt;
		r=score_document(refs,tsts,ngram_info,site,docid,nist,score[site].second);

		res.ref_length+=r.ref_length;
		res.min_tst_ref_len += r.min_tst_ref_len; //><added by xiejun for bleu_sbp 2090617

		for(j=0;j<max_Ngram;j++)
		{
			res.match_cnt[j]+=r.match_cnt[j];
			res.tst_count[j]+=r.tst_count[j];
			res.ref_count[j]+=r.ref_count[j];
			res.tst_info[j]+=r.tst_info[j];
			res.ref_info[j]+=r.ref_info[j];
		}
	}

	//><modified by xiejun for bleu_sbp 20090617
	//if ( nist)
	//	score[site].first.first=nist_score(refs,site,res,SCOREmt);
	//else 
	//	score[site].first.first=bleu_score(site,res,SCOREmt);

	if ( nist == 1)
		score[site].first.first=nist_score(refs,site,res,SCOREmt);
	else if (nist == 0 || nist == -1)
		score[site].first.first=bleu_score(site,res,SCOREmt, nist);


	score[site].first.second=res.tst_count[0];

}

void compute_ngram_info(const SITEMAP & refs, GRAMMAP & ngram_info)
{
	int tot_wrds;
	SITEMAP::const_iterator it1;
	map<SENT,int> ngram_count;
	tot_wrds=0;
	for(it1=refs.begin();it1!=refs.end();it1++)
	{
		string site=it1->first;
		const DOCMAP & docs=it1->second;
		DOCMAP::const_iterator it2;
		for(it2=docs.begin();it2!=docs.end();it2++)
		{
			string docid=it2->first;
			const SEGMAP & segs=it2->second;
			SEGMAP::const_iterator it3;
			for(it3=segs.begin();it3!=segs.end();it3++)
			{
				int segid=it3->first;
				const SENT & seg=it3->second;
				map<SENT,int> ngrams;
				tot_wrds+=seg.size();
				Words2Ngrams(seg,ngrams);
				map<SENT,int>::iterator it4;
				for(it4=ngrams.begin();it4!=ngrams.end();it4++)
				{
					if (ngram_count.find(it4->first)!=ngram_count.end())
						ngram_count[it4->first]+=it4->second;
					else
						ngram_count[it4->first]=it4->second;
				}
			}
		}
	}

	map<SENT,int>::iterator it5;
	for(it5=ngram_count.begin();it5!=ngram_count.end();it5++)
	{
		const SENT & ngram=it5->first;
		SENT mgram;
		int p;
		double d;
		p=ngram.length();
		if (p>0)
			mgram=ngram.substr(0,p-1);
		else
			mgram.clear();
		if (mgram.length()>0)
			d=(double)ngram_count[ngram]/ngram_count[mgram];
		else
			d=(double)ngram_count[ngram]/tot_wrds;
		ngram_info[ngram]=-log(d)/log(2.0);
		//if (0) //showngram
		//{
		//	vector<string> wrds;
		//	split(ngram,wrds);
		//	_tprintf(_T("ngram info:%9.4f%6d%6d%8d%3d %s\n"), ngram_info[ngram], ngram_count[ngram],
		//		mgram.size()!=0 ? ngram_count[mgram] : tot_wrds, tot_wrds, wrds.size(), u2t(ngram).c_str());
		//}
	}
}

// Convert the ASCII letters 'A'..'Z' of s to lower case in place.
// Every other byte is left untouched.
static void makelower(string & s)
{
	const char shift='a'-'A';
	for(string::iterator ch=s.begin();ch!=s.end();++ch)
	{
		if (*ch<'A' || *ch>'Z')
			continue;
		*ch+=shift;
	}
}

// Replace occurrences of t in s by r, in place.
//
// Substitution is repeated until s no longer contains t, so text created by a
// substitution is matched again: replacing "  " by " " collapses a run of
// spaces of any length to a single space.
//
// Two inputs that could previously never finish are handled explicitly:
//  * an empty t leaves s unchanged;
//  * when r itself contains t, a single left-to-right pass is made and the
//    inserted text is not scanned again.
static void replace(string & s, const string t, const string r)
{
	if (t.empty())
		return;

	const bool r_contains_t = r.find(t) != string::npos;

	// After a substitution at p, a new match can only begin inside the last
	// t.length()-1 characters before p (overlapping the inserted text) or
	// later, so there is no need to rescan from the start of s.
	const string::size_type overlap = t.length()-1;

	string::size_type from = 0;
	while (true)
	{
		const string::size_type p = s.find(t,from);
		if (p == string::npos)
			return;
		s.replace(p,t.length(),r);
		if (r_contains_t)
			from = p+r.length();
		else
			from = p>overlap ? p-overlap : 0;
	}
}

// Return non-zero when c lies in one of the listed Unicode ranges for scripts
// that are written without spaces between words. The default word segmenter
// emits such characters as separate tokens.
static int isEastAsia(wchar_t c)
{
	// all block characters for languages without space break between words
	return (
		(c>=0x0f00 && c<=0x0fff) // Tibetan
		|| (c>=0x1100 && c<=0x11ff) //Hangul Jamo
		|| (c>=0x2e80 && c<=0x2fd5) //CJK and KangXi Radicals
		|| (c>=0x2f00 && c<=0x2fe0) //KangXi Radicals
		|| (c>=0x2ff0 && c<=0x2ffb) //Ideographic Description
		|| (c>=0x3000 && c<=0x303f) //CJK Symbols and Punctuation
		|| (c>=0x3040 && c<=0x309f) //Hiragana
		|| (c>=0x30a0 && c<=0x30ff) //Katakana
		|| (c>=0x3100 && c<=0x312f) //Bopomofo
		|| (c>=0x3130 && c<=0x318f) //Hangul Compatibility Jamo
		|| (c>=0x3190 && c<=0x319f) //Kanbun
		|| (c>=0x31a0 && c<=0x31bf) //Bopomofo extended
		|| (c>=0x31c0 && c<=0x31ef) //CJK strokes
		|| (c>=0x31f0 && c<=0x31ff) //Katakana Phonetic Extensions
		// Extension A begins at U+3400; the lower bound used to be 0x3d00, which
		// left U+3400..U+3CFF out. The upper bound is kept as it was.
		|| (c>=0x3400 && c<=0x4dff) //CJK Unified Ideographs Extension A
		|| (c>=0x4e00 && c<=0x9fff) //CJK Unified Ideographs
		|| (c>=0xa000 && c<=0xa48f) //Yi
		|| (c>=0xa490 && c<=0xa4cf) //Yi Radicals
		|| (c>=0xac00 && c<=0xd7a3) //Hangul Syllables
		|| (c>=0xf900 && c<=0xfaff) //CJK Compatibility Ideographs
		|| (c>=0xff00 && c<=0xffef) //Halfwidth and FullWidth Forms
		);
}

// Normalize one segment in place: apply a fixed list of textual substitutions,
// run the word segmenter, and rebuild s as the resulting words joined by
// single spaces.
//
// s    - segment text; overwritten with the normalized form.
// lang - passed through to the word segmenter.
//
// Case folding is not done here (the callers visible in this file call
// makelower themselves when required).
static void NormalizeText(string & s, const tstring & lang)
{
	// substitutions, applied strictly in this order
	static const char * const substitutions[][2] =
	{
		{ "<skipped>", ""   },
		{ "-\n",       ""   },
		{ "\n",        " "  },
		{ "\t",        " "  },
		{ "&quot;",    "\"" },
		{ "&amp;",     "&"  },
		{ "&lt;",      "<"  },
		{ "&gt;",      ">"  },
		{ "&apos;",    "\'" },	// added by xiejun 20090618
	};
	const int count=(int)(sizeof(substitutions)/sizeof(substitutions[0]));
	for(int k=0;k<count;k++)
		replace(s,substitutions[k][0],substitutions[k][1]);

	// a segmenter installed through setwordsegmenter takes precedence
	vector<string> words;
	if (pwordsegmenter)
		pwordsegmenter(s,words,lang);
	else
		defaultwordsegmenter(s,words,lang);

	// rebuild the text as space-separated words
	s.clear();
	for(vector<string>::const_iterator w=words.begin();w!=words.end();++w)
	{
		if (w!=words.begin())
			s+=" ";
		s+=*w;
	}
}

// Read the source file fn and register every segment it contains.
//
// srcs - output: for every "DOC" element (keyed by its docid attribute) and
//        every "seg" element inside it (keyed by its numeric id attribute) the
//        entry srcs[docid][sid] is created and emptied. The segment text
//        itself is not stored.
// fn   - file to read; it is loaded with readtextfile(..., EC_AUTO, 1).
//
// Returns the srclang attribute of the last "srcset" element, or an empty
// string when the file contains none.
tstring get_source_info(DOCMAP & srcs, const tstring & fn)
{
	string data,tag,span,setdata;
	tstring src_lang;

	readtextfile(fn.c_str(),data,EC_AUTO,1);

	// srcset -> DOC -> seg
	while (extract_tag_and_span("srcset",data,tag,setdata))
	{
		src_lang=u2t(extract_tag_attribute("srclang",tag));

		while (extract_tag_and_span("DOC",setdata,tag,span))
		{
			string sent;
			const string docid=extract_tag_attribute("docid",tag);

			while (extract_tag_and_span("seg",span,tag,sent))
			{
				const int sid=strtol(extract_tag_attribute("id",tag).c_str(),NULL,10);
				srcs[docid][sid].clear();
			}
		}
	}

	return src_lang;
}

// Read the reference file fn and build the vocabulary from it.
//
// setid         - name of the enclosing set element to look for.
// docs          - output: docs[site][docid][sid] receives the segment as a
//                 sequence of word ids.
// voc           - output: cleared, then filled with a word -> id mapping. Ids
//                 are handed out from 2 upwards in order of first appearance
//                 (get_tst_data in this file uses id 1 for unknown words).
// preserve_case - when zero, each segment is lower-cased before normalization.
// fn            - file to read; loaded with readtextfile(..., EC_AUTO, 1).
//
// Returns the trglang attribute of the last set element that was read.
tstring get_ref_data(const string & setid, SITEMAP & docs, VOCAB & voc, int preserve_case, const tstring & fn)
{
	string data,tag,setdata,docdata;
	tstring tgt_lang;

	voc.clear();
	int next_id=2;

	readtextfile(fn.c_str(),data,EC_AUTO,1);

	// set -> DOC -> seg
	while (extract_tag_and_span(setid,data,tag,setdata))
	{
		tgt_lang=u2t(extract_tag_attribute("trglang",tag));

		while (extract_tag_and_span("DOC",setdata,tag,docdata))
		{
			string sent;
			const string docid=extract_tag_attribute("docid",tag);
			const string site=extract_tag_attribute("site",tag);

			while (extract_tag_and_span("seg",docdata,tag,sent))
			{
				const int sid=strtol(extract_tag_attribute("id",tag).c_str(),NULL,10);

				if (!preserve_case)
					makelower(sent);
				NormalizeText(sent,tgt_lang);

				vector<string> words;
				split(sent,words);

				// encode the words, extending the vocabulary as new ones appear
				SENT & ss=docs[site][docid][sid];
				ss.clear();
				for(vector<string>::const_iterator w=words.begin();w!=words.end();++w)
				{
					int id;
					const map<string,int>::iterator known=voc.find(*w);
					if (known==voc.end())
					{
						id=next_id++;
						voc[*w]=id;
					}
					else
					{
						id=known->second;
					}
					ss.push_back(id);
				}
			}
		}
	}

	return tgt_lang;
}

// Read the test (system output) file fn and encode it with an existing
// vocabulary.
//
// setid         - name of the enclosing set element to look for.
// docs          - output: docs[site][docid][sid] receives the segment as a
//                 sequence of word ids.
// voc           - word -> id mapping; words missing from it are encoded as 1.
// preserve_case - when zero, each segment is lower-cased before normalization.
// fn            - file to read; loaded with readtextfile(..., EC_AUTO, 1).
//
// Returns the trglang attribute of the last set element that was read.
tstring get_tst_data(const string & setid, SITEMAP & docs, const VOCAB & voc, int preserve_case, const tstring & fn)
{
	string data,tag,setdata,docdata;
	tstring tgt_lang;

	readtextfile(fn.c_str(),data,EC_AUTO,1);

	// set -> DOC -> seg
	while (extract_tag_and_span(setid,data,tag,setdata))
	{
		tgt_lang=u2t(extract_tag_attribute("trglang",tag));

		while (extract_tag_and_span("DOC",setdata,tag,docdata))
		{
			string sent;
			const string docid=extract_tag_attribute("docid",tag);
			const string site=extract_tag_attribute("site",tag);

			while (extract_tag_and_span("seg",docdata,tag,sent))
			{
				const int sid=strtol(extract_tag_attribute("id",tag).c_str(),NULL,10);

				// a segment that wraps its text in a "cand" element is reduced
				// to the span extracted for that element
				if ( sent.find("<cand") != string::npos )
				{
					string cand_data;
					extract_tag_and_span("cand",sent, tag, cand_data);
					sent = cand_data;
				}

				if (!preserve_case)
					makelower(sent);
				NormalizeText(sent,tgt_lang);

				vector<string> words;
				split(sent,words);

				// encode the words; 1 stands for "not in the vocabulary"
				SENT & ss=docs[site][docid][sid];
				ss.clear();
				for(vector<string>::const_iterator w=words.begin();w!=words.end();++w)
				{
					const map<string,int>::const_iterator hit=voc.find(*w);
					if (hit==voc.end())
						ss.push_back(1);
					else
						ss.push_back(hit->second);
				}
			}
		}
	}

	return tgt_lang;
}

// SGM counterpart of get_source_info: read the source file fn and register
// every segment it contains.
//
// srcs - output: for every "DOC" element (keyed by its docid attribute) and
//        every "seg" element inside it (keyed by its numeric id attribute) the
//        entry srcs[docid][sid] is created and emptied. The segment text
//        itself is not stored.
// fn   - file to read; it is loaded with readtextfile(..., EC_AUTO, 0).
//
// Returns the srclang attribute of the last "srcset" element, or an empty
// string when the file contains none.
tstring sgm_get_source_info(DOCMAP & srcs, const tstring & fn)
{
	string data,tag,span,setdata;
	tstring src_lang;

	readtextfile(fn.c_str(),data,EC_AUTO,0);

	// srcset -> DOC -> seg
	while (extract_tag_and_span("srcset",data,tag,setdata))
	{
		src_lang=u2t(extract_tag_attribute("srclang",tag));

		while (extract_tag_and_span("DOC",setdata,tag,span))
		{
			string sent;
			const string docid=extract_tag_attribute("docid",tag);

			while (extract_tag_and_span("seg",span,tag,sent))
			{
				const int sid=strtol(extract_tag_attribute("id",tag).c_str(),NULL,10);
				srcs[docid][sid].clear();
			}
		}
	}
	return src_lang;
}

// SGM counterpart of get_ref_data: read the reference file fn and build the
// vocabulary from it. The system name is taken from the "sysid" attribute.
//
// setid         - name of the enclosing set element to look for.
// docs          - output: docs[site][docid][sid] receives the segment as a
//                 sequence of word ids.
// voc           - output: cleared, then filled with a word -> id mapping. Ids
//                 are handed out from 2 upwards in order of first appearance
//                 (sgm_get_tst_data in this file uses id 1 for unknown words).
// preserve_case - when zero, each segment is lower-cased before normalization.
// fn            - file to read; loaded with readtextfile(..., EC_AUTO, 0).
//
// Returns the trglang attribute of the last set element that was read.
tstring sgm_get_ref_data(const string & setid, SITEMAP & docs, VOCAB & voc, int preserve_case, const tstring & fn)
{
	string data,tag,setdata,docdata;
	tstring tgt_lang;

	int next_id=2;
	voc.clear();

	readtextfile(fn.c_str(),data,EC_AUTO,0);

	// set -> DOC -> seg
	while (extract_tag_and_span(setid,data,tag,setdata))
	{
		tgt_lang=u2t(extract_tag_attribute("trglang",tag));

		while (extract_tag_and_span("DOC",setdata,tag,docdata))
		{
			string sent;
			const string docid=extract_tag_attribute("docid",tag);
			const string site=extract_tag_attribute("sysid",tag);

			while (extract_tag_and_span("seg",docdata,tag,sent))
			{
				const int sid=strtol(extract_tag_attribute("id",tag).c_str(),NULL,10);

				if (!preserve_case)
					makelower(sent);
				NormalizeText(sent,tgt_lang);

				vector<string> words;
				split(sent,words);

				// encode the words, extending the vocabulary as new ones appear
				SENT & ss=docs[site][docid][sid];
				ss.clear();
				for(vector<string>::const_iterator w=words.begin();w!=words.end();++w)
				{
					int id;
					const map<string,int>::iterator known=voc.find(*w);
					if (known==voc.end())
					{
						id=next_id++;
						voc[*w]=id;
					}
					else
					{
						id=known->second;
					}
					ss.push_back(id);
				}
			}
		}
	}
	return tgt_lang;
}

// SGM counterpart of get_tst_data: read the test (system output) file fn and
// encode it with an existing vocabulary. The system name is taken from the
// "sysid" attribute.
//
// setid         - name of the enclosing set element to look for.
// docs          - output: docs[site][docid][sid] receives the segment as a
//                 sequence of word ids.
// voc           - word -> id mapping; words missing from it are encoded as 1.
// preserve_case - when zero, each segment is lower-cased before normalization.
// fn            - file to read; loaded with readtextfile(..., EC_AUTO, 0).
//
// Returns the trglang attribute of the last set element that was read.
tstring sgm_get_tst_data(const string & setid, SITEMAP & docs, const VOCAB & voc, int preserve_case, const tstring & fn)
{
	string data,tag,setdata,docdata;
	tstring tgt_lang;

	readtextfile(fn.c_str(),data,EC_AUTO,0);

	// set -> DOC -> seg
	while (extract_tag_and_span(setid,data,tag,setdata))
	{
		tgt_lang=u2t(extract_tag_attribute("trglang",tag));

		while (extract_tag_and_span("DOC",setdata,tag,docdata))
		{
			string sent;
			const string docid=extract_tag_attribute("docid",tag);
			const string site=extract_tag_attribute("sysid",tag);

			while (extract_tag_and_span("seg",docdata,tag,sent))
			{
				const int sid=strtol(extract_tag_attribute("id",tag).c_str(),NULL,10);

				if (!preserve_case)
					makelower(sent);
				NormalizeText(sent,tgt_lang);

				vector<string> words;
				split(sent,words);

				// encode the words; 1 stands for "not in the vocabulary"
				SENT & ss=docs[site][docid][sid];
				ss.clear();
				for(vector<string>::const_iterator w=words.begin();w!=words.end();++w)
				{
					const map<string,int>::const_iterator hit=voc.find(*w);
					if (hit==voc.end())
						ss.push_back(1);
					else
						ss.push_back(hit->second);
				}
			}
		}
	}
	return tgt_lang;
}
/*
void setdetail(int dt)
{
	detail=dt;
}
*/
//void setcase(int c)
//{
//	preserve_case=c;
//}

// Install p as the word segmenter used by NormalizeText. While the stored
// pointer is null, NormalizeText calls defaultwordsegmenter instead.
void setwordsegmenter(PWORDSEGMENTER p)
{
	pwordsegmenter = p;
}

// Built-in word segmenter.
//
// Walks through s one character at a time (as returned by get_char), copying
// the characters into a wide string and surrounding some of them with spaces
// so that they become tokens of their own:
//   * the ASCII symbols covered by the ranges listed below;
//   * '.' and ',' unless they stand between two digits;
//   * '-' directly after a digit;
//   * characters for which isEastAsia reports non-zero.
// Everything else, including letters and digits, is copied unchanged. The
// result is converted with w2u, double spaces are collapsed, leading and
// trailing spaces are removed, and the text is handed to split.
//
// s     - text to segment.
// words - output, filled by split.
// lang  - not used by this segmenter.
//
// Digits separated by spaces are not joined (that step was removed in
// mteval-v11b.pl).
void defaultwordsegmenter(const string & s, vector<string> & words, const tstring & lang)
{
	wstring spaced;
	const size_t total=s.length();
	size_t pos=0;
	int cur=-1;	// -1 while no character has been read yet

	while (pos<total)
	{
		const int prev=cur;
		cur=get_char(s,pos);

		const bool prev_is_digit = prev>='0' && prev<='9';
		bool isolate;	// emit cur surrounded by spaces?

		if ((cur>='A' && cur<='Z') || (cur>='a' && cur<='z') || (cur>='0' && cur<='9'))
		{
			// letters and digits stay attached to their neighbours
			isolate=false;
		}
		else if ((cur>='{' && cur<='~') || (cur>='[' && cur<'`')
			|| (cur>' ' && cur<='&') || (cur>='(' && cur<='+')
			|| (cur>=':' && cur<='@') || cur=='/')
		{
			isolate=true;
		}
		else if (cur=='.' || cur==',')
		{
			// peek at the following character without consuming it
			size_t look=pos;
			const int next=get_char(s,look);
			const bool next_is_digit = next>='0' && next<='9';
			isolate = !(prev_is_digit && next_is_digit);
		}
		else if (prev_is_digit && cur=='-')
		{
			isolate=true;
		}
		else
		{
			isolate = isEastAsia(cur)!=0;
		}

		if (isolate)
		{
			spaced+=' ';
			spaced+=cur;
			spaced+=' ';
		}
		else
		{
			spaced+=cur;
		}
	}

	string x;
	w2u(x,spaced);

	// collapse double spaces, then strip leading and trailing spaces
	replace(x,"  "," ");
	x.erase(0,x.find_first_not_of(' '));
	x.erase(x.find_last_not_of(' ')+1);

	split(x,words);
}
