package nayami.indexer;

/**
 * Copyright 2006 SEKIGUCHI, Koji
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

import java.io.IOException;
import java.io.File;
import java.io.InputStream;
import java.io.FileInputStream;
import java.util.Iterator;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import nayami.constant.DocumentTypes;

/**
 * Source iterator that converts FAQ HTML files into Lucene documents.
 *
 * <p>Each call to {@link #next()} takes one {@code RelativeFile} from the
 * iterator supplied by the superclass, parses it as HTML and builds a
 * {@link Document} carrying id, date, type, URL, title and content fields.
 * The field factories ({@code getIdField}, {@code getDateField}, ...) and the
 * file enumeration itself are inherited and not defined in this class.</p>
 */
public class FaqSourceIterator extends AbstractFileSourceIterator {

    private static final Log log = LogFactory.getLog( FaqSourceIterator.class );

    /** File extension reported to the superclass through {@link #getFileExtension()}. */
    private static final String FILE_EXTENSION = ".html";

    /**
     * Returns the file extension handled by this iterator.
     *
     * @return the constant {@code ".html"}
     */
    protected String getFileExtension(){
        return FILE_EXTENSION;
    }

    /**
     * Tells whether the underlying file iterator has more entries.
     *
     * @return {@code true} if another file is available
     */
    public boolean hasNext(){
        return getIterator().hasNext();
    }

    /**
     * Builds the Lucene document for the next FAQ file.
     *
     * <p>The file is always consumed from the underlying iterator, even when
     * it cannot be read or parsed. The HTML is parsed before any field is
     * created so that no document is populated only to be discarded.</p>
     *
     * @return the populated document, or {@code null} when the file could not
     *         be read or parsed (the failure is logged by
     *         {@link #parseHtml(File)}); callers must handle {@code null}
     */
    public Document next(){
        RelativeFile rf = (RelativeFile)getIterator().next();
        File html = rf.getFile();

        DocumentFragment fragment = parseHtml( html );
        if( fragment == null ){
            return null;
        }

        Document doc = new Document();
        doc.add( getIdField( rf.getRelativeFilePath() ) );
        doc.add( getDateField( html.lastModified() ) );
        doc.add( getTypeField( DocumentTypes.FAQ ) );
        doc.add( getUrlField( getUrlPrefix() +
                              rf.getRelativePath() +
                              html.getName() ) );
        doc.add( getTitleField( getTitleText( fragment ) ) );
        doc.add( getContentField( getBodyText( fragment ) ) );
        return doc;
    }

    /**
     * Parses an HTML file into a DOM fragment.
     *
     * <p>Read, parse and close failures are logged together with the causing
     * exception so the stack trace is not lost. The input stream is closed
     * in every case.</p>
     *
     * @param html the HTML file to parse
     * @return the parsed fragment, or {@code null} if the file could not be
     *         read or parsed
     */
    protected DocumentFragment parseHtml( File html ){
        DocumentFragment fragment =
            new HTMLDocumentImpl().createDocumentFragment();
        DOMFragmentParser parser = new DOMFragmentParser();
        InputStream is = null;
        try{
            is = new FileInputStream( html );
            parser.parse( new InputSource( is ), fragment );
        }
        catch( IOException e ){
            log.error( "error during reading html. error = " + e.toString(), e );
            return null;
        }
        catch( SAXException e ){
            log.error( "error during parsing html. error = " + e.toString(), e );
            return null;
        }
        finally{
            try{
                if( is != null ){
                    is.close();
                }
            }
            catch( IOException e ){
                // A failed close does not invalidate an already parsed
                // fragment, so it is only reported.
                log.error( "error during closing html. error = " + e.toString(), e );
            }
        }
        return fragment;
    }

    /**
     * Extracts the text of the first {@code title} element below a node.
     *
     * @param node root of the subtree to search
     * @return the concatenated title text, or an empty string if no such
     *         element exists
     */
    protected String getTitleText( Node node ){
        StringBuffer title = new StringBuffer();
        getText( title, node, "title" );
        return title.toString();
    }

    /**
     * Extracts the text of the first {@code body} element below a node.
     *
     * @param node root of the subtree to search
     * @return the concatenated body text, or an empty string if no such
     *         element exists
     */
    protected String getBodyText( Node node ){
        StringBuffer body = new StringBuffer();
        getText( body, node, "body" );
        return body.toString();
    }

    /**
     * Appends the value of every text node in a subtree, in document order.
     * No separator is inserted between the values of adjacent text nodes.
     *
     * <p>To implement this method, "Lucene in Action" is used as reference.</p>
     *
     * @param sb   buffer receiving the text
     * @param node root of the subtree to collect text from
     */
    protected void getText( StringBuffer sb, Node node ){
        if( node.getNodeType() == Node.TEXT_NODE ){
            sb.append( node.getNodeValue() );
        }
        NodeList children = node.getChildNodes();
        if( children != null ){
            int count = children.getLength();
            for( int i = 0; i < count; i++ ){
                getText( sb, children.item( i ) );
            }
        }
    }

    /**
     * Searches a subtree depth-first for the first element with the given
     * name (compared case-insensitively) and appends all of its text.
     *
     * <p>To implement this method, "Lucene in Action" is used as reference.</p>
     *
     * @param sb      buffer receiving the text of the matching element
     * @param node    root of the subtree to search
     * @param element name of the element to look for
     * @return {@code true} if a matching element was found; the search stops
     *         at the first match
     */
    protected boolean getText( StringBuffer sb, Node node, String element ){
        if( node.getNodeType() == Node.ELEMENT_NODE
            && element.equalsIgnoreCase( node.getNodeName() ) ){
            getText( sb, node );
            return true;
        }
        NodeList children = node.getChildNodes();
        if( children != null ){
            int count = children.getLength();
            for( int i = 0; i < count; i++ ){
                if( getText( sb, children.item( i ), element ) ){
                    return true;
                }
            }
        }
        return false;
    }
}
