/*
 * Decompiled with CFR 0.152.
 */
package org.semanticdesktop.aperture.extractor.word;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.HashSet;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.SavedByEntry;
import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.node.Resource;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.vocabulary.RDF;
import org.ontoware.rdf2go.vocabulary.XSD;
import org.semanticdesktop.aperture.extractor.Extractor;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.extractor.microsoft.util.PoiUtil;
import org.semanticdesktop.aperture.extractor.word.TmExtractorsWrapper;
import org.semanticdesktop.aperture.rdf.RDFContainer;
import org.semanticdesktop.aperture.rdf.util.ModelUtil;
import org.semanticdesktop.aperture.util.StringExtractor;
import org.semanticdesktop.aperture.vocabulary.APERTURE_NIE_EXTENSIONS;
import org.semanticdesktop.aperture.vocabulary.NCO;
import org.semanticdesktop.aperture.vocabulary.NFO;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link Extractor} implementation for binary MS Word documents.
 *
 * <p>Extraction is attempted in up to three stages, each reading the same
 * (marked and reset) input stream:
 * <ol>
 *   <li>Apache POI (HWPF): save history, metadata and text;</li>
 *   <li>the optional textmining.org extractors, when they can be loaded;</li>
 *   <li>a heuristic plain-text extraction, used only when no text was
 *       obtained by the previous stages.</li>
 * </ol>
 */
public class WordExtractor
implements Extractor {
    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Extracts text and metadata from the given stream into {@code result}.
     * Delegates to the six-argument overload with no pre-parsed file system.
     *
     * @param id       not used by this implementation
     * @param stream   the document content
     * @param charset  not used by this implementation
     * @param mimeType not used by this implementation
     * @param result   container that receives the extracted statements
     * @throws ExtractorException if an {@link IOException} occurs during extraction
     */
    public void extract(URI id, InputStream stream, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
        this.extract(id, stream, null, charset, mimeType, result);
    }

    /**
     * Extracts text and metadata from the given stream into {@code result}.
     *
     * @param id         not used by this implementation
     * @param stream     the document content
     * @param fileSystem an already parsed POI file system, or {@code null} to
     *                   have one built from {@code stream}
     * @param charset    not used by this implementation
     * @param mimeType   not used by this implementation
     * @param result     container that receives the extracted statements
     * @throws ExtractorException if an {@link IOException} occurs during extraction
     */
    public void extract(URI id, InputStream stream, POIFSFileSystem fileSystem, Charset charset, String mimeType, RDFContainer result) throws ExtractorException {
        try {
            this.extractAll(stream, fileSystem, result);
        }
        catch (IOException e) {
            throw new ExtractorException(e);
        }
    }

    /**
     * Runs the extraction stages and stores the outcome in {@code container}.
     *
     * <p>The stream is wrapped in a {@link BufferedInputStream} when it does not
     * support marking. Before every stage the stream is marked with the buffer
     * size reported by {@link PoiUtil#getBufferSize()} and it is reset afterwards
     * in a {@code finally} block; an {@link IOException} thrown by
     * {@code reset()} propagates to the caller.
     * NOTE(review): reset presumably fails when a stage consumed more bytes than
     * the mark limit - confirm against the configured buffer size.
     *
     * <p>If POI reports the document as encrypted the method returns right
     * away: no further stage is tried and neither the plain text nor the
     * {@code NFO.PaginatedTextDocument} type is added here.
     */
    private void extractAll(InputStream stream, POIFSFileSystem fileSystem, RDFContainer container) throws ExtractorException, IOException {
        int bufferSize = PoiUtil.getBufferSize();
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream, bufferSize);
        }
        String text = null;
        URI uri = container.getDescribedUri();

        // Stage 1: Apache POI.
        stream.mark(bufferSize);
        try {
            if (fileSystem == null) {
                fileSystem = new POIFSFileSystem((InputStream)new PoiUtil.NonCloseableStream(stream));
            }
            text = WordExtractor.tryPoiExtraction(fileSystem.getRoot(), fileSystem, container);
        }
        catch (EncryptedDocumentException e) {
            // Encrypted document: stop here, the finally block still resets the stream.
            return;
        }
        catch (Exception e) {
            // Any other failure is only logged so that the fallback stages still run.
            this.logger.debug("basic POI text extraction failed for " + container.getDescribedUri(), e);
        }
        finally {
            stream.reset();
        }

        // Stage 2: optional textmining.org extractors.
        if (this.tmExtractorsPresentOnClassPath()) {
            stream.mark(bufferSize);
            try {
                String tmText = this.tryTextminingOrgExtraction(stream, uri);
                // Prefer the textmining.org result when POI produced nothing, produced
                // a shorter text, or produced text without a single "text" character.
                if (tmText != null
                        && (text == null || tmText.length() > text.length() || WordExtractor.isGarbageText(text))) {
                    text = tmText;
                }
            }
            finally {
                stream.reset();
            }
        }

        // Stage 3: heuristic string extraction, only when nothing was found so far.
        if (text == null) {
            stream.mark(bufferSize);
            try {
                text = this.tryHeuristicPlaintextExtraction(stream, uri);
            }
            finally {
                stream.reset();
            }
        }

        if (text != null) {
            text = text.trim();
            if (text.length() > 0) {
                container.add(NIE.plainTextContent, text);
            }
        }
        container.add(RDF.type, NFO.PaginatedTextDocument);
    }

    /**
     * Tells whether the text contains no character accepted by
     * {@link #isInitialTextChar(char)}. An empty string is reported as garbage.
     */
    private static boolean isGarbageText(String text) {
        for (int i = 0; i < text.length(); ++i) {
            if (WordExtractor.isInitialTextChar(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether the Unicode general category of {@code c} is a letter, a
     * decimal digit, a space separator or a punctuation category.
     */
    private static boolean isInitialTextChar(char c) {
        switch (Character.getType(c)) {
            case Character.UPPERCASE_LETTER:
            case Character.LOWERCASE_LETTER:
            case Character.TITLECASE_LETTER:
            case Character.MODIFIER_LETTER:
            case Character.OTHER_LETTER:
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.SPACE_SEPARATOR:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Extracts save history, metadata and text from a Word document using POI.
     *
     * <p>Save history and metadata are written to {@code container} before the
     * text is read. When the text extractor reports an encrypted document, the
     * encryption status is recorded in {@code container} and the exception is
     * rethrown.
     *
     * @param dir       directory node holding the document
     * @param fs        the POI file system the node belongs to
     * @param container container that receives the extracted statements
     * @return the document text after {@code stripFields} has been applied
     * @throws IOException                if reading the document fails
     * @throws EncryptedDocumentException if the document is encrypted
     */
    public static String tryPoiExtraction(DirectoryNode dir, POIFSFileSystem fs, RDFContainer container) throws IOException, EncryptedDocumentException {
        WordExtractor.extractSaveHistory(dir, fs, container);
        PoiUtil.extractMetadata(dir, container);
        try {
            org.apache.poi.hwpf.extractor.WordExtractor extractor = new org.apache.poi.hwpf.extractor.WordExtractor(dir, fs);
            return org.apache.poi.hwpf.extractor.WordExtractor.stripFields(extractor.getText());
        }
        catch (EncryptedDocumentException e) {
            container.add(NFO.encryptionStatus, NFO.encryptedStatus);
            throw e;
        }
    }

    /**
     * Opens the document with HWPF and records its save history, if any.
     * The save history is optional information, so an
     * {@link IllegalStateException} raised while doing so is ignored.
     */
    private static void extractSaveHistory(DirectoryNode dir, POIFSFileSystem fs, RDFContainer container) throws IOException {
        try {
            HWPFDocument document = new HWPFDocument(dir, fs);
            WordExtractor.extractSaveHistory(document, container);
        }
        catch (IllegalStateException ignored) {
            // Deliberately ignored: the save history is best-effort.
            // NOTE(review): presumably this covers documents HWPF refuses to open,
            // e.g. encrypted ones, which tryPoiExtraction reports afterwards - confirm.
        }
    }

    /**
     * Adds one save-history entry per row of the document's "saved by" table.
     *
     * <p>Every entry gets its own randomly generated entry and contact
     * resources; the entry index starts at 1. A contact is additionally linked
     * to the described document as contributor only the first time its user
     * name is seen.
     */
    private static void extractSaveHistory(HWPFDocument document, RDFContainer container) {
        SavedByTable table = document.getSavedByTable();
        if (table == null) {
            return;
        }
        Model model = container.getModel();
        int index = 1;
        HashSet<String> contributors = new HashSet<String>();
        for (Object entryObject : table.getEntries()) {
            SavedByEntry entry = (SavedByEntry)entryObject;
            String userName = entry.getUserName();
            Resource entryRes = ModelUtil.generateRandomResource(model);
            Resource contactRes = ModelUtil.generateRandomResource(model);
            model.addStatement(entryRes, RDF.type, APERTURE_NIE_EXTENSIONS.SaveHistoryEntry);
            model.addStatement(container.getDescribedUri(), APERTURE_NIE_EXTENSIONS.hasSaveHistoryEntry, entryRes);
            model.addStatement(entryRes, APERTURE_NIE_EXTENSIONS.location, entry.getSaveLocation());
            model.addStatement(entryRes, APERTURE_NIE_EXTENSIONS.index, Integer.toString(index++), XSD._integer);
            model.addStatement(entryRes, APERTURE_NIE_EXTENSIONS.editor, contactRes);
            model.addStatement(contactRes, RDF.type, NCO.Contact);
            model.addStatement(contactRes, NCO.fullname, userName);
            // Set.add returns false for a user name that was already registered.
            if (contributors.add(userName)) {
                model.addStatement(container.getDescribedUri(), NCO.contributor, contactRes);
            }
        }
    }

    /**
     * Tries to extract the text with the textmining.org extractors.
     *
     * @return the extracted text, or {@code null} if the extraction threw an exception
     */
    private String tryTextminingOrgExtraction(InputStream stream, URI uri) {
        try {
            TmExtractorsWrapper wrapper = new TmExtractorsWrapper();
            return wrapper.getText(new PoiUtil.NonCloseableStream(stream));
        }
        catch (Exception e) {
            this.logger.debug("textmining.org extraction failed for " + uri, e);
            return null;
        }
    }

    /**
     * Tries to extract text with the heuristic {@link StringExtractor}.
     *
     * @return the extracted text, or {@code null} if an {@link IOException} occurred
     */
    private String tryHeuristicPlaintextExtraction(InputStream stream, URI uri) {
        try {
            StringExtractor extractor = new StringExtractor();
            return extractor.extract(stream);
        }
        catch (IOException e) {
            this.logger.warn("IOException while performing heuristic string extraction for " + uri, e);
            return null;
        }
    }

    /**
     * Tells whether the optional textmining.org extractor factory can be loaded.
     *
     * <p>{@code Class.forName} also initializes the class, so besides
     * {@link ClassNotFoundException} it may fail with a {@link LinkageError}
     * (for instance when a dependency of the library is missing). Both cases
     * are reported as "not available" so that the optional stage is skipped
     * instead of aborting the whole extraction.
     */
    private boolean tmExtractorsPresentOnClassPath() {
        try {
            Class.forName("org.textmining.extraction.word.WordTextExtractorFactory");
            return true;
        }
        catch (ClassNotFoundException e) {
            return false;
        }
        catch (LinkageError e) {
            this.logger.debug("textmining.org extractors are present but could not be loaded", e);
            return false;
        }
    }
}

