package gihyo.lucene.ch6;

/**
 * Copyright 2006 SEKIGUCHI, Koji
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Comparator;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class TestTermFreqVector {

    static final String TEXT = "i͂߂Ɂj̓x̑Ǐʂ󂯁AOxi݂сjAtb̏dӂSƂɂȂ܂BuvȂĐȂvAuԂɂł邱Ƃ͖ԂɁvAunɂł邱Ƃ͒nɁvƂ̕j̉AR}yь}ɂÄ肵ՂɗāA\vfsoł܂B͏ACȗA䂪̍ĐƔWɌAZAŐAKAΏoɂ킽L͈͂ȍ\vi߂Ă܂܂B̌ʁA{oς́AsǍ̏ڕWA{̍oɗ邱ƂȂAԎ哱̌iC񕜂ւ̓ݎn߂܂Bv̉肪lXȕő傫Ȗ؂Ɉ炿錻݁Av~iƁj߂Ă͂Ȃ܂BiXcƍ\v̉jvi߂ĂہA{Iȕj͎x̂ɁAʂ̘̋_ɓƁAv̕ǂɂԂ荪΂ɒʂ܂B̑_^Ae_΂̓T^ŁAłȒ傫ƂĒNN悤ƂȂ̂AXcł܂Bu炵ȂvAusvfsȂvAuԂɂł邱Ƃ͖ԂɁvÅ{jɂ͑̐lX^̂ɁAȂAXƂ͌łȂ΂łȂ̂AԐlɔCȂ̂ł傤B̍ɂāAXc֘A@Ă͔ی܂B̂ߎ́A{ɍc͕KvȂƔfĂ̂Aڂ̈ӎvmFƎvAOc@U܂BXćA܂ɍsAAoρAZȂǂ镪̍\vɂȂuv̖{ہvƊmM邩ł܂BXc̐񂪖ꂽ̓x̑IɂāAɎ^鎩R}yь}́A̍̐MC܂B́A̖ӂ傫ȎxƂāA߂ėXc֘A@ĂoA\鍑ŌRcAӂł܂BXƂ́AQUl̏΂̍ƌiĂ܂B̈SƈSǂŠx@QTlAECE󂷂ׂĂ̎q͂QSlAĉւƑSES\̍݊OقɋΖĂOȐEɎĂ͂Ulɂyт܂BXƂ^cKv̂ł傤BXƂ𖯉c΁AnӍHvƒmbɂAlłǂiT[rXWJƎv܂B̑؂ȎY𖯊ԌƂĊp邱Ƃ́Aoς̊ɂȂ܂B]ƏĂ@lœ̎x⊔̔pȂǂɂAČɂv܂BXćAȑfŌIȐ{̎̂ł܂B̊Ԃɂ́Acɂāuߑan̗X֋ǂȂȂ̂ł͂ȂvAuX֋ǂŒیȂȂ̂ł͂ȂvƂŝ݂mĂ܂B̋MdȎYłX֋ǂ̃lbg[NێA̗ւɎxႪȂ悤ɂ܂B́uv̗Xcł͂ȂAuov̐{nZ@ւ̉vɎgł܂܂Bunɂł邱Ƃ͒nɁvƂj̉AS~x̕⏕vAR~K͂ڎwŌڏAntł̌̎Oʈ̂̉vɂāAn̈ӌ^Ɏ󂯎~߁ANx܂łɊmɎ܂BtȗAƔS팸ȂǁAɂPO~ɏΏovfs܂BQOPON㏉ɂ́AIȎxoVȎ؋ɗ炸ɂ̔Nx̐ŎŘd悤A\vɑS͂Ŏg݂܂Bƌ̋^ɊւAsƒnꂼŖԂ̋^Ԃɍ킹Ȃǋ^̌nƂƂɁAƌ̒̏ڕWݒ肵Al̍팸s܂B́Â悤ȍ\vfsA{̋K͂_ɏkĂ܂܂BïSƈSjqiމ䂪́A{iIȐlЉڑOɍTĂAq⑷̐ɕS摗肷邱ƂȂAlЂƂ肪LȐ𑗂邱Ƃł銈͂Љ\zĂȂ΂Ȃ܂BNAÁA𒌂ƂЉۏᐧx́AxՂłB̏ɑ΂sĂ߂ɂ́AKȋtƕSŎ\ȐxƂ邱Ƃ̐ӔCłƍl܂BƂ킯Nx́AIȎɗĉvi߂KvA^}݂JċcsAӌ̑𖄂߂w͂邱ƂsłB̑䕗Ȃǂ̍ЊQč암ł̃nP[ɂQɑꂽXɑ΂AS炨\グ܂BЎ҂S𑗂悤A̔Вn̕ƕɖSƂƂɁAz̑ϐk𑣐iȂǍЊQɋÂi߂Ă܂܂BQ̊g傪OAXxXgɑΏ邽߁AQҋ~ϑ΍AXxXg̑SȏȂǂɐ{Ďgł܂܂BiE̒̓{j䂪̈SƔɉhɂ́AE̕aƈ肪܂BēƍۋO̊{ƂāAێЉ̐ӔCƂĂ̖ϋɓIɉʂĂ܂܂BێЉ͍Ar㍑̊Jn̍An̕ۑSAʔj󕺊̊gUh~ȂǁAĂ͑złȂ悤ȕGȉۑɒʂĂ܂B́A̍AŁARς鏔ɑ΂uʓIɋ@\鍑AvKvł邱Ƃi܂Ble̗Ƌ͂𓾂ȂASۏᗝ̉vȂǍA̋ɌđS͂s܂BeƂ̓͏IĂ܂Be΍ʑ[u@̊̉}ȂǁAێЉƋ͂ăe̖h~EɎg݂܂BCNł́A̎ŕaȖ卑ƂݒłA䂪̎Ǝqɂlx́AZ̍]󂯂Ă܂B̎q̊ɂẮACN̗v]⍑ۏ𓥂܂An̏󋵂悭ɂ߂ŔfĂ܂܂B؍n߂Ƃߗ׏Ƃ́ALɂ鋦͂AݗƐMɊÂu̗FD֌W\zĂ܂܂BkNƂ̊Ԃł́AfvAjA~TC̖Iɉč𐳏퉻ڎw܂Bi̍́AEL̗A䂪͂ƂAAWAȂǂɑ傫ȉeyڂƂO܂B䂪́A~ĂΖoȂǍێЉɍvĂ܂AĂѐΖ@N邱Ƃ̂Ȃ悤AeƋٖɋ͂Ă܂܂B񍑊Ԃ̌oϘAgϋɓIɐi߂ƂƂɁAvsnVEh̍ŏIӂɌĐ͓IɎg݂܂Biނсj͍Ŝ̂̂łAꕔ̊v̂łĂ͂Ȃ܂B܂ŗXću\_vł͂ȂƂ̎wE܂AǏʁÁu_vłƂ̐RƎv܂B́A̐lɎ󂯎~߁AӔCėXcĂ܂܂Bɂ݂ꂸAv̕ǂɂЂ܂Aߋ̊ɂƂꂸA̋͂̉Ag𓊂oAtb̐EӂʂׂAS͂sĂ܂܂BvȂĖ͂܂B̎xȂĉv͎sł܂Bv͍̌͂lЂƂłAv邩ۂ́A̋ӎvƐƂ̒fłs͂ɂĂ܂B{Љɂ́AVɒ킷ӗ~Ɓu΂łvƂM萶Ă܂Bv~iƁj߂邱ƂȂAECƏMāA{̖邢zł͂܂񂩁Bтɋceʂ̌䋦͂S炨肢\グ܂B";
    static final String FIELD = "F";
    static final Directory directory = new RAMDirectory();
    static final Analyzer analyzer = new JapaneseAnalyzer();

    /**
     * Program entry point: builds the index, then prints the filtered term
     * frequencies read back from it.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if building or reading the index fails
     */
    public static void main( String[] args ) throws IOException {
	// The index must be written before its term vector can be read.
	makeIndex();
	printTermFreqVector();
    }

    /**
     * Writes a single document into {@link #directory}. The document has one
     * field named {@link #FIELD} holding {@link #TEXT}; the field is tokenized
     * with {@link #analyzer}, its value is not stored, and its term vector is
     * recorded so that term frequencies can be read back later.
     *
     * @throws IOException if the index cannot be opened or written
     */
    static void makeIndex() throws IOException {
	IndexWriter writer = new IndexWriter( directory, analyzer, true );
	try {
	    Document doc = new Document();
	    doc.add( new Field( FIELD, TEXT, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES ) );
	    writer.addDocument( doc );
	}
	finally {
	    // Close the writer even when adding the document fails, so it is
	    // never left open on the shared directory.
	    writer.close();
	}
    }

    /**
     * Reads the term frequency vector of document 0 for {@link #FIELD}, filters
     * and sorts it with {@link #getNormalizedTermFreqVector}, and prints one
     * line per remaining term to standard output in the form
     * <code>term :&lt;TAB&gt;frequency</code>.
     *
     * @throws IOException if the index cannot be opened or read
     */
    static void printTermFreqVector() throws IOException {
	IndexReader reader = IndexReader.open( directory );
	try {
	    TermFreqVector tfv = reader.getTermFreqVector( 0, FIELD );
	    List ntfv = getNormalizedTermFreqVector( tfv );
	    for( Iterator i = ntfv.iterator(); i.hasNext(); ){
		TermFreq tf = (TermFreq)i.next();
		System.out.println( tf.getTerm() + " :\t" + tf.getFreq() );
	    }
	}
	finally {
	    // Release the reader even if reading or printing fails.
	    reader.close();
	}
    }

    /**
     * Converts a term frequency vector into a filtered, sorted list of
     * {@link TermFreq} entries.
     *
     * A term is kept only when its frequency is greater than 4 and it either
     * does not start with a hiragana character or is longer than 3 characters.
     * The resulting list is sorted with {@link TermFreqComparator}.
     *
     * @param tfv the term frequency vector to filter; may be <code>null</code>
     *            (for example when no term vector is available), in which case
     *            an empty list is returned
     * @return a new mutable list of {@link TermFreq} objects, never <code>null</code>
     */
    static List getNormalizedTermFreqVector( final TermFreqVector tfv ){
	List list = new ArrayList();
	if( tfv == null )
	    return list;

	// Fetch the parallel arrays once instead of on every iteration.
	final String[] terms = tfv.getTerms();
	final int[] freqs = tfv.getTermFrequencies();
	final int size = tfv.size();
	for( int i = 0; i < size; i++ ){
	    String term = terms[i];
	    int freq = freqs[i];
	    // Skip rare terms, and empty terms that have no first character
	    // to classify.
	    if( freq <= 4 || term.length() == 0 )
		continue;
	    if( !isHiragana( term.charAt( 0 ) ) || term.length() > 3 )
		list.add( new TermFreq( term, freq ) );
	}
	Collections.sort( list, new TermFreqComparator() );
	return list;
    }

    static boolean isHiragana( final char c ){
	return c >= '' && c <= '';
    }

    /**
     * Simple holder that pairs a term with its frequency.
     */
    static class TermFreq {
	/** The term text. */
	String term;

	/** The frequency recorded for the term. */
	int freq;

	/**
	 * Creates a holder for the given term and frequency.
	 *
	 * @param term the term text
	 * @param freq the frequency of the term
	 */
	TermFreq( String term, int freq ){
	    this.freq = freq;
	    this.term = term;
	}

	/**
	 * @return the term text passed to the constructor
	 */
	String getTerm(){
	    return this.term;
	}

	/**
	 * @return the frequency passed to the constructor
	 */
	int getFreq(){
	    return this.freq;
	}
    }

    /**
     * Orders {@link TermFreq} objects by descending frequency: the entry with
     * the higher frequency sorts first, and entries with equal frequency
     * compare as equal (so a stable sort keeps their existing relative order).
     *
     * All instances behave identically, so any two instances are considered
     * equal to each other.
     */
    static class TermFreqComparator implements Comparator {
	/**
	 * Compares two {@link TermFreq} objects by frequency, highest first.
	 *
	 * @param o1 the first {@link TermFreq}
	 * @param o2 the second {@link TermFreq}
	 * @return a positive value if <code>o1</code> has the lower frequency,
	 *         a negative value if it has the higher frequency, and 0 if
	 *         both frequencies are the same
	 * @throws ClassCastException if either argument is not a {@link TermFreq}
	 */
	public int compare( Object o1, Object o2 ){
	    int f1 = ((TermFreq)o1).getFreq();
	    int f2 = ((TermFreq)o2).getFreq();
	    if( f1 < f2 )
		return 1;
	    if( f1 > f2 )
		return -1;
	    // Ties must yield 0; answering -1 in both directions breaks the
	    // Comparator contract and can make the sort fail.
	    return 0;
	}

	/**
	 * @param other the object to compare with
	 * @return <code>true</code> if <code>other</code> is also a
	 *         <code>TermFreqComparator</code>
	 */
	public boolean equals( Object other ){
	    return other instanceof TermFreqComparator;
	}

	/**
	 * @return the same value for every instance, consistent with
	 *         {@link #equals(Object)}
	 */
	public int hashCode(){
	    return TermFreqComparator.class.hashCode();
	}
    }
}
