package org.cnlp.apachecon.search;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.QueryTermVector;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;

import gihyo.lucene.ch3.BookIndexer;

/**
 * Copyright 2005 Center for Natural Language Processing
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

/**
 * Minimal question-answering service on top of a Lucene index.
 *
 * The question is analyzed into terms, the terms are combined into one unordered
 * {@link SpanNearQuery} over the {@link BookIndexer#F_SUMMARY} field, and every
 * matching span is turned into a {@link QAResult} holding the span text plus a few
 * tokens of surrounding context.  Candidates are scored by {@link #score} and sorted
 * with {@link QAResultComparator}.
 *
 * Progress information is written to <code>System.out</code>.
 **/
public class QAService
{

    private static final Comparator qaResultComparator = new QAResultComparator();

    /**
     * Slop of the SpanNearQuery.  A large window lets a match span multiple sentences.  This could be
     * fine-tuned by looking at average sentence length, or average document length, or based on what
     * size windows are of interest.
     */
    private static final int SPAN_SLOP = 15;

    /** Number of token positions of extra context taken before and after each matching span. */
    private static final int CONTEXT_WINDOW = 3;

    /**
     * Identify possible candidates by using a set of SpanQuerys.  Create a SpanNearQuery that wraps all of the
     * keywords in the query, collect the matching spans per document, re-analyze each matching document once
     * to map span positions back to character offsets, and score every resulting candidate.
     *
     * @param question The original question
     * @param analyzer The Analyzer to use on the Question and on the stored summary text
     * @param indexReader The IndexReader for the Index to search against
     * @return A List of {@link QAResult}s, sorted with {@link QAResultComparator}
     * @throws IOException propagated from the token streams, the span enumeration or the index reader
     */
    public List getCandidates(String question, Analyzer analyzer, IndexReader indexReader) throws IOException
    {
        //We need the tokens in order, whereas the TermVector returns them sorted lexicographically
        List queryTokens = readQueryTokens(question, analyzer);
        //Use the vector to get the Boost values and remove duplicates
        QueryTermVector queryVec = new QueryTermVector(question, analyzer);

        SpanQuery query = buildSpanQuery(queryVec);
        Spans spans = query.getSpans(indexReader);
        System.out.println("Terms: " + query.getTerms());

        //store up the spans for each document so we only have to analyze the document once
        Map docToSpans = groupSpansByDocument(spans);

        List candidates = new ArrayList();
        for (Iterator iterator = docToSpans.entrySet().iterator(); iterator.hasNext();)
        {
            Map.Entry entry = (Map.Entry) iterator.next();
            int docId = ((Integer) entry.getKey()).intValue();
            Document doc = indexReader.document(docId);
            extractCandidates(docId, doc, (LinkedList) entry.getValue(), analyzer, queryTokens, queryVec, candidates);
        }

        //Sort them by score
        Collections.sort(candidates, qaResultComparator);
        System.out.println("Terms: " + query.getTerms());
        System.out.println("Candidates: " + candidates.size());
        return candidates;
    }

    /**
     * Analyze the question and return its tokens in the order the analyzer produced them.
     * The token stream is always closed, even when analysis fails.
     */
    private List readQueryTokens(String question, Analyzer analyzer) throws IOException
    {
        List tokens = new ArrayList();
        TokenStream stream = analyzer.tokenStream("", new StringReader(question));
        try
        {
            Token token;
            while ((token = stream.next()) != null)
            {
                tokens.add(token);
            }
        }
        finally
        {
            stream.close();
        }
        return tokens;
    }

    /**
     * Build an unordered SpanNearQuery with one SpanTermQuery per distinct query term; each clause is
     * boosted by how often the term occurs in the question.
     * We could also use a SpanOrQuery to have a higher recall, but maybe less precision.
     */
    private SpanQuery buildSpanQuery(QueryTermVector queryVec)
    {
        String [] terms = queryVec.getTerms();
        int [] freqs = queryVec.getTermFrequencies();
        SpanQuery [] clauses = new SpanQuery[terms.length];
        for (int i = 0; i < terms.length; i++)
        {
            SpanTermQuery termQuery = new SpanTermQuery(new Term(BookIndexer.F_SUMMARY, terms[i]));
            termQuery.setBoost(freqs[i]);
            clauses[i] = termQuery;
        }
        return new SpanNearQuery(clauses, SPAN_SLOP, false);
    }

    /**
     * Drain the span enumeration into a Map from document id (Integer) to a LinkedList of
     * {@link SpanPosInfo}, kept in the order the spans were enumerated.
     */
    private Map groupSpansByDocument(Spans spans) throws IOException
    {
        Map docToSpans = new HashMap();
        while (spans.next())
        {
            Integer docId = new Integer(spans.doc());
            LinkedList docSpans = (LinkedList) docToSpans.get(docId);
            if (docSpans == null)
            {
                docSpans = new LinkedList();
                docToSpans.put(docId, docSpans);
            }
            System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
            docSpans.add(new SpanPosInfo(spans.start(), spans.end()));
        }
        return docToSpans;
    }

    /**
     * Re-analyze the stored summary of one document and append a scored {@link QAResult} to
     * <code>candidates</code> for every span whose context window can still be reached.
     *
     * The summary is tokenized exactly once.  This relies on <code>pendingSpans</code> being ordered by
     * start position (as the span enumeration is assumed to deliver them); a span whose window start lies
     * inside text already consumed by an earlier candidate is dropped instead of blocking all later spans.
     *
     * @param docId        Lucene document number
     * @param doc          the stored document
     * @param pendingSpans spans of this document; consumed (emptied) by this method
     * @param analyzer     analyzer used to tokenize the stored summary
     * @param queryTokens  query tokens in order, passed through to {@link #score}
     * @param queryVec     query term vector, passed through to {@link #score}
     * @param candidates   output list receiving the new results
     */
    private void extractCandidates(int docId, Document doc, LinkedList pendingSpans, Analyzer analyzer,
                                   List queryTokens, QueryTermVector queryVec, List candidates) throws IOException
    {
        String docTitle = doc.get(BookIndexer.F_TITLE);            //get its title
        String isbn = doc.get(BookIndexer.F_ISBN);                 //get its isbn field
        System.out.println("Analyzing doc: " + docId + " ISBN: " + isbn);
        if ((docTitle == null) || docTitle.equals("")) //use the isbn if it has no title
        {
            docTitle = isbn;
        }
        String content = doc.get(BookIndexer.F_SUMMARY);
        if (content == null)
        {
            //no retrievable summary text: there is nothing to cut candidate strings from
            return;
        }

        //We need to determine where the span occurs in the content
        TokenStream stream = analyzer.tokenStream(BookIndexer.F_SUMMARY, new StringReader(content));
        try
        {
            SpanPosInfo info = null;
            //pos is incremented before it is compared, so a first token with increment 1 sits at position 1.
            //NOTE(review): span positions reported by Lucene look zero-based while pos is one-based here -
            //confirm whether the context window is shifted by one token.
            int pos = 0;
            Token token;
            while ((token = stream.next()) != null)
            {
                int previousPos = pos;
                pos += token.getPositionIncrement();
                if (info == null)
                {
                    info = nextReachableSpan(pendingSpans, previousPos);
                    if (info == null)
                    {
                        break;//no more left
                    }
                }
                //Wait for the first token at or after the window start.  ">=" rather than "==" so that a
                //position increment larger than 1 cannot jump over the start and stall the whole document.
                if (pos < windowStart(info))
                {
                    continue;
                }

                //we have a candidate
                QAResult result = new QAResult(docId, docTitle, isbn);
                result.addToken(token);
                int startOffset = token.startOffset();
                Token lastToken = token;

                //skip ahead until we reach the end of the window or run out of tokens
                int windowEnd = info.endPos + CONTEXT_WINDOW;
                while (pos < windowEnd && (token = stream.next()) != null)
                {
                    pos += token.getPositionIncrement();
                    result.addToken(token);
                    lastToken = token;
                }
                //Always a character offset: if the text ended early the candidate stops at the last token read
                int endOffset = lastToken.endOffset();

                //We could do something more sophisticated to make sure we return whole sentences
                String candidate = content.substring(startOffset, endOffset);
                System.out.println("Candidate string: " + candidate);
                result.setCandidate(candidate);
                result.setStartOffset(startOffset);
                result.setEndOffset(endOffset);
                score(queryTokens, queryVec, result);
                candidates.add(result);
                info = null;

                if (token == null)
                {
                    break;//the stream was exhausted while collecting the window
                }
            }
        }
        finally
        {
            stream.close();
        }
    }

    /**
     * Remove and return the first pending span whose context window starts after
     * <code>consumedPos</code>, i.e. after the last token position already read.
     *
     * @return the next usable span, or <code>null</code> when none is left
     */
    private static SpanPosInfo nextReachableSpan(LinkedList pendingSpans, int consumedPos)
    {
        while (!pendingSpans.isEmpty())
        {
            SpanPosInfo next = (SpanPosInfo) pendingSpans.removeFirst();
            if (windowStart(next) > consumedPos)
            {
                return next;
            }
        }
        return null;
    }

    /**
     * First token position of the context window of a span: {@link #CONTEXT_WINDOW} positions in front of
     * the span start so we have a bigger window.  A better way would be to get the surrounding sentences for
     * the sentence(s) that this span hits.  The lower bound is 1 (the original author's note records that it
     * was changed from 0 to 1), matching the position of the first token as counted by the caller.
     */
    private static int windowStart(SpanPosInfo info)
    {
        return Math.max(1, info.startPos - CONTEXT_WINDOW);
    }

    /**
     * Score the candidate and store the result via {@link QAResult#setScore}.
     *
     * The score is the number of (substring) occurrences of each query term in the lower-cased candidate,
     * weighted by the term's frequency in the query; it is doubled when every term occurs at least once,
     * doubled again when the query tokens can be found in query order, and finally divided by the number
     * of tokens in the candidate so that shorter answers are preferred.  The boost values would be
     * parameterized in a real system.
     *
     * @param queryTokens The {@link org.apache.lucene.analysis.Token}s from the query, in order
     * @param queryVec The TermVector of the Query.
     * @param result The {@link QAResult} that we are scoring; must already hold its candidate text and tokens
     */
    private void score(List queryTokens, QueryTermVector queryVec, QAResult result)
    {
        float score = 0;
        //First give some points for how many keywords are in the result and boost it based on the frequency of the term in the query
        String [] terms = queryVec.getTerms();
        int [] freqs = queryVec.getTermFrequencies();
        String cand = result.getCandidate().toLowerCase();
        boolean seenAll = true;
        for (int i = 0; i < terms.length; i++)
        {
            String term = terms[i];
            int seen = 0;
            int index = -1;
            while ((index = cand.indexOf(term, index + 1)) != -1)
            {
                seen += 1;//one point every time we see the term in the candidate
            }
            if (seen == 0)
            {
                seenAll = false;
            }
            score += seen * freqs[i]; //boost by the frequency
        }
        //Give a boost if the candidate has all the terms
        if (seenAll)
        {
            System.out.println("Has all terms!");
            score *= 2;
        }
        //could do something like see how much distance between terms

        //Now give a boost if the terms appear in order.  Each token is searched from where the previous one
        //was found, so a term that is repeated in the query can match a later occurrence in the candidate.
        boolean inOrder = true;
        int lastIndex = 0;
        for (Iterator iterator = queryTokens.iterator(); iterator.hasNext();)
        {
            Token queryToken = (Token) iterator.next();
            int index = cand.indexOf(queryToken.termText(), lastIndex);
            if (index == -1)
            {
                inOrder = false;
                break;
            }
            lastIndex = index;
        }
        if (inOrder)
        {
            System.out.println("In Order!");
            score *= 2;
        }
        //normalize the score based on the length of the candidate, shorter answers are better
        result.setScore(score/result.getTokens().size());
    }

    /**
     * Start and end token positions of one span match, as reported by {@link Spans#start()} and
     * {@link Spans#end()}.  Static because it needs no access to the enclosing service.
     */
    private static final class SpanPosInfo
    {
        final int startPos;
        final int endPos;

        SpanPosInfo(int startPos, int endPos)
        {
            this.startPos = startPos;
            this.endPos = endPos;
        }
    }
}
