import java.io.*;
import java.net.URL;
import java.util.*;


/**
 * This class is responsible for extracting data from PDF documents.
 *
 * PDF documents have character data embedded in them called Info Dictionaries.
 * These dictionaries are formatted as follows:
 * <pre>
 * 	&lt;&lt; /Author (The Author) /Title (The Title)
 *		/Subject (The Description) /Keywords (The Keywords) &gt;&gt;
 * </pre>
 *
 * @version $Id: PDFParser.java,v 1.13 2002/09/22 23:13:29 blsecres Exp $
 * @author Ben Secrest &lt;blsecres@users.sourceforge.net&gt;
 */
public class PDFParser implements FileParser, LinkExtractor {
    /** The default logging level for this module */
    private final static int LOGLEVEL = 9;

    /** File extensions for PDF files */
    private final static String[] extensions = {"pdf"};

    /** Mime types for PDF files */
    private final static String[] mimeTypes = {"application/pdf"};

    /** String representation of the type of files this parser works with */
    private final static String fileType = "PDF";

    /** PDF file magic signature */
    private final static byte[][] magic = {{'%', 'P', 'D', 'F', '-', '1', '.'}};

    /** PDF headers are always at the beginning of the file */
    private final static boolean magicOffset = false;

    /** PDF headers are case sensitive */
    private final static boolean magicCase = true;

    /** PDF FileMagic structure */
    private final static FileMagic pdfMagic = new FileMagic(magic, magicOffset,
            magicCase);

    /**
     * Charset used to decode the raw bytes of the document.  ISO-8859-1 maps
     * every byte to exactly one character, so the tokenizer sees the same
     * input regardless of the platform's default charset.
     */
    private final static String PDF_CHARSET = "ISO-8859-1";

    /** The character used to indicate a data tag */
    private final static char TAG_LEADER = '/';

    /** The character used to indicate start of data */
    private final static char DATA_START_DELIM = '(';

    /** The character used to indicate stop of data */
    private final static char DATA_STOP_DELIM = ')';

    /** The character that introduces an escape sequence inside data */
    private final static char ESCAPE_LEADER = '\\';


    // encodings for parser state machine
    private final static byte MAIN          = 0x00;
    private final static byte TAG_START     = 0x01;
    private final static byte DATA_START    = 0x02;
    private final static byte READ_DATA     = 0x03;
    private final static byte ESCAPED_CHAR  = 0x04;

    // encodings for meta tag type
    private final static byte NO_META   = 0x00;
    private final static byte AUTHOR    = 0x01;
    private final static byte TITLE     = 0x02;
    private final static byte DESCR     = 0x03;
    private final static byte KEYWORDS  = 0x04;
    private final static byte HREF      = 0x05;

    /** Determines if this parser will search for title */
    private boolean wantTitle;

    /** A parsed document's title */
    private String title;

    /** Determines if this parser will search for author */
    private boolean wantAuthor;

    /** A parsed document's author */
    private String author;

    /** Determines if this parser will search for description */
    private boolean wantDescription;

    /** A parsed document's description */
    private String description;

    /** Determines if this parser will search for keywords */
    private boolean wantKeywords;

    /** A parsed document's keywords */
    private String keywords;

    /** Determines if the parser will provide file type */
    private boolean wantFileType;

    /** Determines if the parser will provide parser information */
    private boolean wantParser;

    /** Determines if this parser will search for hyperlinks */
    private boolean wantURLs;

    /** A document's URLs; only non-null while a link-collecting parse runs */
    private HashSet hyperlinks;

    /** The logging object for this module; may be null (logging disabled) */
    private IGLog log;


    /**
     * Construct a new PDFParser that has no logger and extracts nothing.
     * Use {@link #setLog(IGLog)}, {@link #setWantedItems(IGKeySet)} and
     * {@link #wantURLs(boolean)} to configure it before parsing.
     */
    public PDFParser() {
        log = null;
        hyperlinks = null;

        wantTitle = false;
        wantAuthor = false;
        wantDescription = false;
        wantKeywords = false;
        wantURLs = false;
        wantFileType = false;
        wantParser = false;
    }


    /**
     * Set the desired attributes to extract
     * @param wanted A set of bits describing preferences
     */
    public void setWantedItems(IGKeySet wanted) {
        wantTitle = wanted.wants(IGKey.TITLE);
        wantAuthor = wanted.wants(IGKey.AUTHOR);
        wantDescription = wanted.wants(IGKey.DESCRIPTION);
        wantKeywords = wanted.wants(IGKey.KEYWORDS);
        wantFileType = wanted.wants(IGKey.FILE_TYPE);
        wantParser = wanted.wants(IGKey.PARSER);
    }


    /**
     * Set the logger to use with this parser
     * @param logObj The object to use for logging data; <tt>null</tt>
     *  disables logging
     */
    public void setLog(IGLog logObj) {
        log = logObj;
    }


    /**
     * Parse a file extracting the required information.  The file is opened
     * here and is always closed again before this method returns, whether
     * parsing succeeds or fails.
     * @param file The IGFile object to fill in information for
     * @throws IOException if an error occurs while reading data
     * @throws FileNotFoundException if the file given to be parsed does not
     *  exist
     */
    public void parse(IGFile file) throws IOException, FileNotFoundException {
        if (log != null && LOGLEVEL >= IGLog.PROCEDURE) {
            log.add(IGLog.PROCEDURE, "PDFParser.parse(IGFile)");
        }

        InputStream in = new FileInputStream(file.getLocation());
        try {
            parse(file, in);
        } catch (StreamResetException sre) {
            // keep the original failure attached instead of discarding it
            IOException ioe = new IOException(
                    "Stream Reset Exception shouldn't happen");
            ioe.initCause(sre);
            throw ioe;
        } finally {
            // this method opened the stream, so it is responsible for it;
            // closing an already closed FileInputStream is harmless
            in.close();
        }
    }


    /**
     * Parse an opened stream.  The stream is read until all wanted items
     * have been found (or to the end when links are collected) and is closed
     * before this method returns, including when reading fails.  Parsing
     * works without a logger; log output is simply skipped in that case.
     * @param file The IGFile object to fill in information for
     * @param stream The input stream to read data from
     * @throws IOException if an error occurs reading data
     * @throws StreamResetException declared for callers; not thrown by the
     *  code in this method itself
     */
    public void parse(IGFile file, InputStream stream)
            throws IOException, StreamResetException {
        if (log != null && LOGLEVEL >= IGLog.PROCEDURE) {
            log.add(IGLog.PROCEDURE, "PDFParser.parse(IGFile, InputStream)");
        }

        if (log != null && LOGLEVEL >= IGLog.FILE) {
            log.addResource(IGLog.FILE, "PROCESS_FILE",
                    new String[]{file.getLocation()});
        }

        Reader reader = new BufferedReader(
                new InputStreamReader(stream, PDF_CHARSET));

        /*
         * if not searching for an item, assign it a value so the search can
         * kick out when all items != null
         */
        title = (wantTitle ? null : "");
        author = (wantAuthor ? null : "");
        description = (wantDescription ? null : "");
        keywords = (wantKeywords ? null : "");
        hyperlinks = (wantURLs ? new HashSet() : null);

        try {
            StreamTokenizer tokenizer = newTokenizer(reader);

            if (log != null && LOGLEVEL >= IGLog.SECTION) {
                log.addResource(IGLog.SECTION, "FP_BEGIN_PARSE",
                        new String[]{fileType});
            }

            scanTokens(tokenizer);
        } finally {
            // release the stream even if reading failed part way through
            reader.close();
        }

        if (wantTitle) {
            file.put(IGKey.TITLE, title);
        }
        if (wantAuthor) {
            file.put(IGKey.AUTHOR, author);
        }
        if (wantDescription) {
            file.put(IGKey.DESCRIPTION, description);
        }
        if (wantKeywords) {
            file.put(IGKey.KEYWORDS, keywords);
        }
        if (wantFileType) {
            file.put(IGKey.FILE_TYPE, fileType);
        }
        if (wantParser) {
            file.put(IGKey.PARSER, getClass().getName());
        }

        if (wantURLs) {
            file.put(IGKey.URLS, hyperlinks);
            hyperlinks = null;
        }

        if (log != null && LOGLEVEL >= IGLog.SECTION) {
            log.addResource(IGLog.SECTION, "FP_FINISH_PARSE",
                    new String[]{fileType});
        }
    }


    /**
     * Build a tokenizer whose syntax table suits PDF dictionaries: no
     * comments, no quoting, no number parsing.  After configuration the
     * only printable ASCII characters that are NOT word characters are
     * <tt>! " ' ( ) / &lt; = &gt; \</tt>; each of those is returned as a
     * single-character token.
     * @param reader The character source to tokenize
     * @return The configured tokenizer
     */
    private static StreamTokenizer newTokenizer(Reader reader) {
        StreamTokenizer tokenizer = new StreamTokenizer(reader);

        tokenizer.eolIsSignificant(false);
        tokenizer.lowerCaseMode(false);         // keep case of words
        tokenizer.ordinaryChar('"');            // no quote feature
        tokenizer.ordinaryChar('\'');           // no quote feature
        tokenizer.ordinaryChar('/');            // '/' is not a comment char
        tokenizer.slashSlashComments(false);    // no comment processing
        tokenizer.slashStarComments(false);

        // Clear the numeric flag from every character that has it by
        // default.  '-' must be cleared as well: wordChars() only adds the
        // word flag, so a leading "-5" would otherwise still be returned as
        // TT_NUMBER and end up in the data as a bogus character.
        tokenizer.ordinaryChars('0', '9');
        tokenizer.ordinaryChar('.');
        tokenizer.ordinaryChar('-');

        tokenizer.wordChars('#', '&');
        tokenizer.wordChars('*', '.');
        tokenizer.wordChars('0', ';');
        tokenizer.wordChars('?', '[');
        tokenizer.wordChars(']', '~');

        return tokenizer;
    }


    /**
     * Run the state machine over the token stream, storing every wanted
     * value that is found.  Scanning stops at end of input, or earlier once
     * title, author, description and keywords are all set and no links are
     * being collected.
     * @param tokenizer The configured tokenizer to read from
     * @throws IOException if an error occurs reading data
     */
    private void scanTokens(StreamTokenizer tokenizer) throws IOException {
        byte curState = MAIN;                   // current FSM state
        byte nextState = MAIN;                  // next FSM state
        byte curMeta = NO_META;                 // current tag being parsed
        StringBuffer curValue = new StringBuffer();     // current tag value

        while (tokenizer.nextToken() != StreamTokenizer.TT_EOF) {
            switch (curState) {
            /*
             * main state, consume tokens until the start of a PDF tag
             */
            case MAIN :
                if (tokenizer.ttype == TAG_LEADER) {
                    nextState = TAG_START;
                }
                break;
            /*
             * a tag has been started, examine its type and
             * transition accordingly
             */
            case TAG_START : {
                byte meta = NO_META;
                if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
                    meta = wantedTag(tokenizer.sval);
                }
                if (meta == NO_META) {
                    nextState = MAIN;
                } else {
                    curMeta = meta;
                    nextState = DATA_START;
                }
                break;
            }
            /*
             * A tag the parser is interested in has been found, search for
             * start-of-data delimiter
             */
            case DATA_START :
                switch (tokenizer.ttype) {
                case DATA_START_DELIM :
                    nextState = READ_DATA;
                    break;
                case TAG_LEADER :
                    nextState = TAG_START;
                    break;
                default :
                    nextState = MAIN;
                    break;
                }
                break;
            /*
             * Inside a set of data delimiters, read and save data until an
             * end-of-data delimiter is encountered
             */
            case READ_DATA :
                switch (tokenizer.ttype) {
                case DATA_STOP_DELIM :
                    recordValue(curMeta, curValue.toString());
                    curValue.setLength(0);
                    curMeta = NO_META;
                    nextState = MAIN;
                    break;
                case StreamTokenizer.TT_WORD :
                    curValue.append(tokenizer.sval)
                            .append(separatorFor(curMeta));
                    break;
                case ESCAPE_LEADER :
                    nextState = ESCAPED_CHAR;
                    break;
                default :
                    // a single non-word character, keep it as is
                    curValue.append((char) tokenizer.ttype);
                    break;
                }
                break;
            /*
             * The previous token was a backslash.  An escaped delimiter or
             * backslash arrives as a single-character token and is kept
             * literally; anything else arrives as a word whose leading
             * escape designator is removed.
             */
            case ESCAPED_CHAR :
                if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
                    curValue.append(stripEscapePrefix(tokenizer.sval))
                            .append(separatorFor(curMeta));
                } else {
                    curValue.append((char) tokenizer.ttype);
                }
                nextState = READ_DATA;
                break;
            }

            curState = nextState;

            if (title != null && author != null && description != null
                    && keywords != null && ! wantURLs) {
                break;          // everything wanted has been found
            }
        }
    }


    /**
     * Decide whether a tag name introduces data this parser still needs.
     * Names are compared case-insensitively.  The meta data tags only match
     * while their value is still unset; the link tags (<tt>F</tt> and
     * <tt>URI</tt>) match whenever links are being collected.
     * @param name The tag name following the tag leader
     * @return The meta type for the tag, or <tt>NO_META</tt> if the tag is
     *  of no interest
     */
    private byte wantedTag(String name) {
        if (author == null && name.equalsIgnoreCase("author")) {
            return AUTHOR;
        }
        if (title == null && name.equalsIgnoreCase("title")) {
            return TITLE;
        }
        if (description == null && name.equalsIgnoreCase("subject")) {
            return DESCR;
        }
        if (keywords == null && name.equalsIgnoreCase("keywords")) {
            return KEYWORDS;
        }
        if (wantURLs && (name.equalsIgnoreCase("f")
                    || name.equalsIgnoreCase("uri"))) {
            return HREF;
        }
        return NO_META;
    }


    /**
     * Store a completed data value in the field belonging to its tag.
     * Each section can have a title, author, etc.; only the first one found
     * (hopefully the document's) is used.  Links are all collected.
     * @param meta The meta type of the tag the value belongs to
     * @param rawValue The value as accumulated, before trimming
     */
    private void recordValue(byte meta, String rawValue) {
        String value = rawValue.trim();

        switch (meta) {
        case AUTHOR :
            if (author == null) {
                author = value;
                if (log != null && LOGLEVEL >= IGLog.PROGRESS) {
                    log.addResource(IGLog.PROGRESS, "FP_FOUND_AUTHOR",
                            new String[]{author});
                }
            }
            break;
        case TITLE :
            if (title == null) {
                title = value;
                if (log != null && LOGLEVEL >= IGLog.PROGRESS) {
                    log.addResource(IGLog.PROGRESS, "FP_FOUND_TITLE",
                            new String[]{title});
                }
            }
            break;
        case DESCR :
            if (description == null) {
                description = value;
                if (log != null && LOGLEVEL >= IGLog.PROGRESS) {
                    log.addResource(IGLog.PROGRESS, "FP_FOUND_DESC",
                            new String[]{description});
                }
            }
            break;
        case KEYWORDS :
            if (keywords == null) {
                keywords = value;
                if (log != null && LOGLEVEL >= IGLog.PROGRESS) {
                    log.addResource(IGLog.PROGRESS, "FP_FOUND_KEYWORDS",
                            new String[]{keywords});
                }
            }
            break;
        case HREF :
            if (log != null && LOGLEVEL >= IGLog.PROGRESS) {
                log.addResource(IGLog.PROGRESS, "EXTRACT_HYPERLINK",
                        new String[]{rawValue});
            }
            hyperlinks.add(value);
            break;
        default :
            break;
        }
    }


    /**
     * Supply the text appended after each word of a value.  The tokenizer
     * drops the whitespace between words, so meta data gets a single space
     * back; link values are joined without any separator.
     * @param meta The meta type of the value being accumulated
     * @return The separator to append after a word
     */
    private static String separatorFor(byte meta) {
        return (meta == HREF ? "" : " ");
    }


    /**
     * Remove the escape designator from the front of a word that directly
     * followed a backslash.  Up to three leading octal digits (an escaped
     * character code) are discarded; the code is NOT decoded, which keeps
     * the long-standing behaviour of this parser.  A leading <tt>n</tt>,
     * <tt>r</tt>, <tt>t</tt>, <tt>b</tt> or <tt>f</tt> (an escaped white
     * space or control character) is discarded as well.  Any other word is
     * returned unchanged, i.e. the backslash is ignored.
     * @param word The word token that followed the backslash
     * @return The word without its escape designator, possibly empty
     */
    private static String stripEscapePrefix(String word) {
        int octalDigits = 0;
        while (octalDigits < 3 && octalDigits < word.length()
                && word.charAt(octalDigits) >= '0'
                && word.charAt(octalDigits) <= '7') {
            octalDigits++;
        }

        if (octalDigits > 0) {
            return word.substring(octalDigits);
        }
        if (word.length() > 0 && "nrtbf".indexOf(word.charAt(0)) >= 0) {
            return word.substring(1);
        }
        return word;
    }


    /**
     * Extract hyper-links from the PDFFile.  The link set stored under
     * <tt>IGKey.URLS</tt> is removed from the file as a side effect.
     * @param file The file to extract links from
     * @return The links as converted by
     *  <tt>IGMisc.hashSetToStringArray</tt>
     */
    public String[] getLinks(IGFile file) {
        // NOTE(review): nothing is stored under IGKey.URLS unless the file
        // was parsed with link collection enabled - confirm that
        // hashSetToStringArray copes with whatever get() returns then
        HashSet urls = (HashSet) file.get(IGKey.URLS);
        file.remove(IGKey.URLS);
        return IGMisc.hashSetToStringArray(urls);
    }


    /**
     * Instruct the parser whether or not links should be collected
     * @param pref If the preference is <tt>true</tt>, links will be collected.
     *  If <tt>false</tt>, no links will be collected.
     */
    public void wantURLs(boolean pref) {
        wantURLs = pref;
    }


    /**
     * Supply extensions this parser can handle
     * @return String array of file extensions
     */
    public String[] getExtensions() {
        return extensions;
    }


    /**
     * Supply mime types this parser can handle
     * @return String array of mime types
     */
    public String[] getMimeTypes() {
        return mimeTypes;
    }


    /**
     * Supply file magic for files this parser can handle
     * @return The FileMagic object describing the PDF signature
     */
    public FileMagic getMagic() {
        return pdfMagic;
    }
}
