/**
 * pdfXtk-Extras - PDF Extraction Toolkit Extras
 * Copyright (c) by the authors/contributors.  All rights reserved.
 * This project includes code from PDFBox and TouchGraph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://pdfxtk.sourceforge.net
 *
 */
package at.ac.tuwien.dbai.pdfwrap;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import javax.imageio.ImageIO;

import at.ac.tuwien.dbai.pdfwrap.analysis.CandidateCluster;
import at.ac.tuwien.dbai.pdfwrap.analysis.LineProcessor;
import at.ac.tuwien.dbai.pdfwrap.analysis.PageProcessor;
import at.ac.tuwien.dbai.pdfwrap.analysis.PageSegmenter;
import at.ac.tuwien.dbai.pdfwrap.comparators.XYTextComparatorOCR;
import at.ac.tuwien.dbai.pdfwrap.comparators.YComparator;
import at.ac.tuwien.dbai.pdfwrap.model.document.*;
import at.ac.tuwien.dbai.pdfwrap.model.graph.*;
import at.ac.tuwien.dbai.pdfwrap.ocr.SegmentExtractor;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFPage;
import at.ac.tuwien.dbai.pdfwrap.table.CandidateColumn;
import at.ac.tuwien.dbai.pdfwrap.table.CandidateTable;
import at.ac.tuwien.dbai.pdfwrap.table.OrderedTable;
import at.ac.tuwien.dbai.pdfwrap.table.TableColumn;
import at.ac.tuwien.dbai.pdfwrap.table.TableFinder;
import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;

import com.javazoid.functions.FileFunctions;

/**
 * General class to take a PDFPage and return a processed Page object,
 * according to the processType
 *
 * @author Tamir Hassan, hassan@dbai.tuwien.ac.at
 * @version PDF Analyser 0.9
 */
public class CustomPageProcessor extends PageProcessor
{

	// Process-type codes that customProcessing(int) reacts to.
	// PP_COLUMN: cluster the text blocks into candidate columns and add those
	// columns to the processing result.
	public final static int PP_COLUMN = 6;
//	public final static int PP_REFINED = 7;
	
	// PP_TABLE: ruled-table detection followed by non-ruled table detection.
	public final static int PP_TABLE = 8;
	// PP_NR_TABLE: non-ruled table detection only (built on candidate columns).
	public final static int PP_NR_TABLE = 9;
	// PP_R_TABLE: ruled-table detection only.
	public final static int PP_R_TABLE = 10;
    
	// Candidate columns produced by the most recent customProcessing(int) run
	// for PP_COLUMN, PP_NR_TABLE or PP_TABLE.
	protected List<CandidateColumn> candCols;
	// Neither read nor written in this class; presumably used by subclasses -- TODO confirm.
	protected List<TableColumn> cols;
	
	// Page image to analyse; when non-null, processPage(PDFPage) additionally
	// runs processPageImage() and callOCR().
	protected BufferedImage pageImage = null;
	// The two flags below are only exposed through their accessors in this
	// class; nothing here branches on them.
	protected boolean performOCR = false;
	protected boolean isOCR = false;
	
    /**
     * Instantiate a new CustomPageProcessor object.
     * The inherited processType field is not modified by this constructor.
     */
    public CustomPageProcessor() // throws IOException -- I don't think there's any need for it now
    {
        // Intentionally empty: the properties file is no longer loaded here.
        // super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
    }
    
    /**
     * Instantiate a new CustomPageProcessor object for a given kind of processing.
     *
     * @param processType process-type code stored in the processType field,
     *                    e.g. one of the PP_* constants of this class
     */
    public CustomPageProcessor(int processType) // throws IOException -- I don't think there's any need for it now
    {
        // The properties file is no longer loaded here.
        // super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
    	this.processType = processType;
    }
    
    /**
     * Processes a single page and returns the resulting Page object.
     * <p>
     * Order of work: optional image segmentation (only when a page image has
     * been set), the generic processing done by doProcessPage, optional OCR
     * (again only with a page image), then customProcessing and postProcessing
     * for the configured processType.
     *
     * @param thisPage the page to process
     * @return the Page returned by doProcessPage, after post-processing
     */
    public Page processPage(PDFPage thisPage) // throws IOException
    {
    	// segment the scanned image first, if there is one
    	if (pageImage != null)
    	{
    		processPageImage();
    	}

    	Page processedPage = doProcessPage(thisPage);

    	// recognise the text of the segments found so far
    	if (pageImage != null)
    	{
    		callOCR();
    	}

    	// custom processing goes here
    	customProcessing(processType);
    	postProcessing(processType, processedPage);

    	return processedPage;
    }

	/**
	 * Column and table analysis for the process types PP_COLUMN, PP_NR_TABLE,
	 * PP_R_TABLE and PP_TABLE; every other process type, and a page without
	 * merged lines, is left untouched.
	 * <p>
	 * What is added to processingResult per process type:
	 * <ul>
	 * <li>PP_R_TABLE: the ruled tables</li>
	 * <li>PP_COLUMN: the candidate columns</li>
	 * <li>PP_NR_TABLE: the non-ruled tables with their columns and rows, plus
	 *     the text blocks not covered by a table cell</li>
	 * <li>PP_TABLE: everything listed for PP_R_TABLE and PP_NR_TABLE</li>
	 * </ul>
	 * Text blocks intersecting a ruled table are removed from textBlocks before
	 * candidate columns are searched.  PP_STRUCT is handled by a separate class.
	 *
	 * @param processType the process-type code to act on
	 */
	public void customProcessing(int processType) // for custom processing, override this method!
    {
    	final boolean wantRuledTables =
    		(processType == PP_R_TABLE || processType == PP_TABLE);
    	final boolean wantCandidateColumns =
    		(processType == PP_COLUMN || processType == PP_NR_TABLE || processType == PP_TABLE);
    	final boolean wantNonRuledTables =
    		(processType == PP_NR_TABLE || processType == PP_TABLE);

    	// nothing to do for other process types or for a page without merged lines
    	if (!(wantRuledTables || wantCandidateColumns) || mergedLines.size() <= 0)
    		return;

    	if (wantRuledTables)
    	{
    		// RULED TABLE DETECTION
    		List<OrderedTable> ruledTables = TableFinder.findRuledTables(textBlocks, lineList);
    		processingResult.addAll(ruledTables);

    		// drop every text block that intersects one of the found tables
    		List<TextBlock> blocksInsideTables = new ArrayList<TextBlock>();
    		for (TextBlock block : textBlocks)
    		{
    			for (OrderedTable table : ruledTables)
    			{
    				if (SegmentUtils.intersects(block, table))
    					blocksInsideTables.add(block);
    			}
    		}
    		textBlocks.removeAll(blocksInsideTables);
    	}

    	if (wantCandidateColumns)
    	{
    		// BUILD NEIGHBOURHOOD GRAPH over the blocks left after ruled-table removal
    		AdjacencyGraph<TextBlock> neighbourhood = new AdjacencyGraph<TextBlock>();
    		neighbourhood.addList(textBlocks);
    		neighbourhood.generateEdgesSingle();

    		// FIND CANDIDATE COLUMNS
    		// TODO: remove hashMap as parameter? WHY???
    		HashMap<GenericSegment, CandidateCluster> clusterLookup =
    			new HashMap<GenericSegment, CandidateCluster>();
    		candCols = TableFinder.clusterTextBlocksIntoCandidateColumns(neighbourhood, 0, clusterLookup);
    	}

    	if (wantNonRuledTables)
    	{
    		List<CandidateTable> candidateTables = TableFinder.findNonRuledTables(candCols, lineList);
    		processingResult.addAll(candidateTables);

    		// NOW PROCESS THE TABLES THEMSELVES
    		List<TextBlock> tableCells = new ArrayList<TextBlock>();
    		for (CandidateTable table : candidateTables)
    		{
    			// after mergeoverlapping and prompter splitting
    			// it's necessary to sort the rows again
    			Collections.sort(table.getRows(), new YComparator());

    			processingResult.addAll(table.getColumns());
    			ListUtils.printListWithSubItems(table.getColumns());
    			processingResult.addAll(table.getRows());
    			ListUtils.printListWithSubItems(table.getRows());
    			tableCells.addAll(table.getCells());

    			OrderedTable orderedTable = table.toOrderedTable();
    			processingResult.add(orderedTable);
    		}

    		// keep only the text blocks that no table cell overlaps in both directions
    		List<TextBlock> blocksOutsideTables = new ArrayList<TextBlock>(textBlocks);
    		List<TextBlock> blocksCoveredByCells = new ArrayList<TextBlock>();
    		for (TextBlock cell : tableCells)
    		{
    			for (TextBlock candidate : blocksOutsideTables)
    			{
    				boolean overlaps = SegmentUtils.horizIntersect(cell, candidate)
    					&& SegmentUtils.vertIntersect(cell, candidate);
    				if (overlaps)
    					blocksCoveredByCells.add(candidate);
    			}
    		}
    		blocksOutsideTables.removeAll(blocksCoveredByCells);
    		processingResult.addAll(blocksOutsideTables);
    	}

    	if (processType == PP_COLUMN)
    		processingResult.addAll(candCols);

    	// RETVAL DOES NOTHING?
    	retVal.getItems().addAll(edgeSegmentList);
    	Collections.sort(retVal.getItems(), new YComparator());

        // 2.04.09: splitting of bisected segments along ruling lines was disabled
        // here because of a problem with derstandard (ruling lines were falsely detected)
    }

	/**
	 * Hook for processing that has to run AFTER all pages have been understood.
	 * Currently a stub: every argument is ignored and a new, empty list is returned.
	 *
	 * @param thePages    the processed pages of the document (unused)
	 * @param processType process-type code (unused)
	 * @param rulingLines unused
	 * @param pageImage   unused
	 * @return a new empty list
	 */
	public static List<Page> processDocPages(List<Page> thePages, int processType, boolean rulingLines, BufferedImage pageImage)
    {
    	// methods to run AFTER all pages have been understood
    	return new ArrayList<Page>();
    }
    
    /**
     * Segments the current page image into character boxes and groups them into
     * text lines.
     * <p>
     * The image is preprocessed and sliced by a SegmentExtractor; the resulting
     * boxes are scaled from pixels to page coordinates, flipped vertically,
     * sorted with XYTextComparatorOCR and stored in fragList, from which
     * textLines is built.  Also sets Utils.IS_OCR to true.
     * <p>
     * Requires pageImage to be non-null.  Every item delivered by the extractor
     * is cast to TextFragment, so anything else raises a ClassCastException.
     */
    protected void processPageImage()
    {
    	long startProcess = System.currentTimeMillis();
    	
    	Utils.IS_OCR = true; // relevance?
    	
		SegmentExtractor extractor = new SegmentExtractor();
		
		System.out.println("pageImage.type: " + pageImage.getColorModel());
		
		// Integer division gives 0 for images narrower than 600 pixels; a zero
		// factor would scale every character box to the origin further down,
		// so never go below 1.
		int imgScale = Math.max(1, pageImage.getWidth() / 600);
		
		BufferedImage procPageImage = extractor.preprocessImage(pageImage, imgScale);
		
		System.out.println("after pageImage.type: " + pageImage.getColorModel());
		
		if (Utils.DISPLAY_TIMINGS)
        	System.out.println("time B: " + (System.currentTimeMillis() - startProcess));
        
        extractor.slice(procPageImage);
        PDFPage thePage = new PDFPage();
        
        // plain primitive: no need to box the ratio
        float scaleFactor = Utils.PDF_POINT_RESOLUTION / Utils.XML_RESOLUTION;
        
        if (Utils.DISPLAY_TIMINGS)
        	System.out.println("time C: " + (System.currentTimeMillis() - startProcess));
        
        // the boxes were found on the preprocessed image, hence the extra imgScale
        Iterator<?> iter = extractor.getCharList().iterator();
        while (iter.hasNext())
        {
        	GenericSegment gs = (GenericSegment)iter.next();
        	gs.scaleCoordinates(scaleFactor * imgScale);
        	thePage.getItems().add(gs);
        }
        
        float[] bBox = {0, pageImage.getWidth(null), 0, pageImage.getHeight(null)};
        thePage.setBoundingBox(bBox);
        thePage.scaleCoordinates(scaleFactor);
        thePage.reverseYCoordinatesPNG();
        
        if (Utils.DISPLAY_TIMINGS)
        	System.out.println("time D: " + (System.currentTimeMillis() - startProcess));
        
        // TODO: replace the line finding below with a proper segmentation algorithm
        Collections.sort(thePage.getItems(), new XYTextComparatorOCR());
        
        fragList = new ArrayList<TextFragment>();
       	for (GenericSegment gs: thePage.getItems())
       		fragList.add((TextFragment)gs);
       	
        textLines =
        	LineProcessor.findLinesFromTextFragments(fragList, 0.75f, false, false);
    }
    
    /**
     * Runs OCR on the segments found so far, using the current page image.
     * <p>
     * For PP_BLOCK the text blocks are recognised directly; for every other
     * process type the text lines are recognised and each text block then
     * derives its text via findText().  An IOException raised by the OCR step
     * is printed and otherwise ignored.
     * <p>
     * note: correct GUI preview is only guaranteed when image is not
     * shifted and scanned at 300dpi
     */
    protected void callOCR()
    {
    	try
    	{
    		// 20.10.10 -- CV_Anonym.pdf.pdf problem:
    		// the image is taken from the dictionary, so its absolute size is
    		// unknown; an arbitrary bounding box is generated by upscaling
    		final float upscale = Utils.PDF_POINT_RESOLUTION / Utils.XML_RESOLUTION;
    		final float[] bBox = {
    			0, pageImage.getWidth() * upscale,
    			0, pageImage.getHeight() * upscale
    		};

    		if (processType == PP_BLOCK)
    		{
    			doOCR(textBlocks, pageImage, bBox);
    			return;
    		}

    		doOCR(textLines, pageImage, bBox);
    		for (TextBlock block : textBlocks)
    		{
    			block.findText();
    		}
    	}
    	catch (IOException e)
    	{
    		// TODO Auto-generated catch block
    		e.printStackTrace();
    	}
    }
    
 // todo replace Image with a BufferedImage (required for cropping)
    /**
     * Recognises the text of every given segment by cropping its region out of
     * the page image, writing it to <code>lineImg.tif</code> in the current
     * working directory and running the <code>tesseract</code> command line
     * tool (language <code>deu</code>) on it.
     * <p>
     * The recognised text is set on the segment; segments for which no text is
     * found are removed from <code>theClusters</code>.  Segments whose cropped
     * region is empty after clamping to the image are skipped unchanged.
     *
     * @param theClusters    segments to recognise; modified in place
     * @param pageImage      the page image the segments were found on
     * @param pageDimensions bounding box of the page, used to flip the
     *                       segment coordinates back into image space
     * @throws IOException if the crop cannot be written (including the case
     *                     that no TIFF writer is installed) or the OCR result
     *                     cannot be read
     */
    protected static void doOCR(List<? extends TextSegment> theClusters, 
    	BufferedImage pageImage, float[] pageDimensions) throws IOException
    {
    	System.out.println("in performOCR with theClusters.size: " + theClusters.size());
    	
    	// output tif's cannot be read by tesseract if they are 2-bit (bug?),
    	// so the page is first redrawn into an RGB copy
    	BufferedImage img = new BufferedImage(pageImage.getWidth(), pageImage.getHeight(), BufferedImage.TYPE_INT_RGB);
    	java.awt.Graphics imgGraphics = img.getGraphics();
    	try
    	{
    		imgGraphics.drawImage(pageImage, 0, 0, null);
    	}
    	finally
    	{
    		// release the graphics context instead of waiting for finalization
    		imgGraphics.dispose();
    	}
    	
    	int origScaleFactor = img.getWidth() / 600;
    	
    	// the exchange files live in the working directory and never change
    	// between segments, so resolve them once
    	String workDir = new File(".").getCanonicalPath() + File.separator;
    	File outFile = new File(workDir + "lineImg.tif");
    	File ocrTextFile = new File(workDir + "lineText.txt");
    	String tesseractCommand =
    		"tesseract " + workDir + "lineImg.tif " + workDir + "lineText -l deu";
    	
    	List<TextSegment> clustersToRemove = new ArrayList<TextSegment>();
    	for (TextSegment thisCluster : theClusters)
    	{
    		PDFPage dummyPage = new PDFPage();
			dummyPage.setBoundingBox(pageDimensions);
			dummyPage.scaleCoordinates(Utils.XML_RESOLUTION / Utils.PDF_POINT_RESOLUTION);
			
			// 14.11.10: whole segments are processed at a time (no longer line by line)
			
			// retranslate coordinates from page space to image pixels
			GenericSegment pixelCoords = new GenericSegment();
			pixelCoords.setBoundingBox(thisCluster.getBoundingBox());
			pixelCoords.scaleCoordinates(Utils.XML_RESOLUTION / Utils.PDF_POINT_RESOLUTION);
			
			// the page object is only used to flip the y axis of the box
			List<GenericSegment> dummyList = new ArrayList<GenericSegment>();
			dummyList.add(pixelCoords);
			dummyPage.setItems(dummyList);
			dummyPage.reverseYCoordinatesPNG();
			pixelCoords = dummyPage.getItems().get(0);
			
			// enlarge the box slightly to allow for rounding errors
			// 16.12.10 adjusted for happy family indiv. lines
			int subX = (int)pixelCoords.getX1() - (2 * origScaleFactor);
			int subY = (int)pixelCoords.getY1() - (1 * origScaleFactor); // was 3, then 2
			int subW = (int)pixelCoords.getWidth() + (4 * origScaleFactor);
			int subH = (int)pixelCoords.getHeight() + (5 * origScaleFactor); // was 8, then 6
			
			System.out.println("before subX: " + subX + " subY: " + subY + " subW: " + subW + " subH: " + subH);
			
			// clamp the box to the image
			if (subX < 0) subX = 0;
			if (subY < 0) subY = 0;
			if (subX > img.getWidth()) subX = img.getWidth();
			if (subY > img.getHeight()) subY = img.getHeight();
			if ((subX + subW) > img.getWidth()) subW = img.getWidth() - subX - 1;
			if ((subY + subH) > img.getHeight()) subH = img.getHeight() - subY - 1;
			if (subW < 0) subW = 0;
			if (subH < 0) subH = 0;
			
			System.out.println(img.getWidth());
			System.out.println(img.getHeight());
			
			System.out.println("after subX: " + subX + " subY: " + subY + " subW: " + subW + " subH: " + subH);
			
			// an empty region cannot be cropped; leave such a segment as it is
			if (subW > 0 && subH > 0)
			{
    			BufferedImage subImage = img.getSubimage(subX, subY, subW, subH);
    			
    			// save image to disk; ImageIO.write reports a missing writer
    			// through its return value, not through an exception
    			if (!ImageIO.write(subImage, "tif", outFile))
    				throw new IOException("no ImageIO writer available for format tif, cannot write " + outFile);
    			System.out.println("outFile: " + outFile.getCanonicalPath());
    			
    			// best effort: remove the result of the previous segment (or of
    			// an earlier run) so that a failed tesseract call cannot make
    			// us read somebody else's text
    			ocrTextFile.delete();
    			
    			// call tesseract from commandline
    			// NOTE(review): the command is a single string, so a working
    			// directory containing spaces will presumably break it -- confirm
    			// how Utils.executeCommand tokenizes its argument
				Utils.executeCommand(tesseractCommand, null, null);
    			
    			// set correct text
				String text = FileFunctions.readTextFile(workDir + "lineText.txt").trim();
				
				// Replace newline characters with whitespace characters
				text = text.replace('\n', ' ');
				System.out.println("text found: " + text);
				
				if (text.length() == 0)
					clustersToRemove.add(thisCluster);
				else
					thisCluster.setText(text);
    		}
    	}
    	theClusters.removeAll(clustersToRemove);
    }

	/**
	 * @return the page image currently set, or null if none has been set
	 */
	public BufferedImage getPageImage() {
		return pageImage;
	}

	/**
	 * Sets the page image.  A non-null image makes processPage(PDFPage) run
	 * image segmentation and OCR in addition to the regular processing.
	 *
	 * @param pageImage the page image, or null to disable the image steps
	 */
	public void setPageImage(BufferedImage pageImage) {
		this.pageImage = pageImage;
	}

	/**
	 * @return the current value of the performOCR flag
	 */
	public boolean isPerformOCR() {
		return performOCR;
	}

	/**
	 * Sets the performOCR flag.  Within this class the flag is only stored;
	 * whether OCR runs in processPage(PDFPage) depends on the page image.
	 *
	 * @param performOCR the new flag value
	 */
	public void setPerformOCR(boolean performOCR) {
		this.performOCR = performOCR;
	}

	/**
	 * @return the current value of the isOCR flag
	 */
	public boolean isOCR() {
		return isOCR;
	}

	/**
	 * Sets the isOCR flag.  Within this class the flag is only stored.
	 *
	 * @param isOCR the new flag value
	 */
	public void setOCR(boolean isOCR) {
		this.isOCR = isOCR;
	}
}
