/*
 * Decompiled with CFR 0.152.
 */
package org.semanticdesktop.aperture.crawler.web;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.ontoware.aifbcommons.collection.ClosableIterator;
import org.ontoware.rdf2go.exception.ModelRuntimeException;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.ontoware.rdf2go.vocabulary.RDF;
import org.semanticdesktop.aperture.accessor.AccessData;
import org.semanticdesktop.aperture.accessor.DataAccessor;
import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
import org.semanticdesktop.aperture.accessor.DataObject;
import org.semanticdesktop.aperture.accessor.FileDataObject;
import org.semanticdesktop.aperture.accessor.RDFContainerFactory;
import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
import org.semanticdesktop.aperture.accessor.base.FilterAccessData;
import org.semanticdesktop.aperture.crawler.ExitCode;
import org.semanticdesktop.aperture.crawler.base.CrawlerBase;
import org.semanticdesktop.aperture.crawler.web.CrawlJob;
import org.semanticdesktop.aperture.datasource.config.DomainBoundaries;
import org.semanticdesktop.aperture.datasource.web.WebDataSource;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractor;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractorFactory;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractorRegistry;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.util.IOUtil;
import org.semanticdesktop.aperture.util.UrlUtil;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class WebCrawler
extends CrawlerBase {
    private Logger logger = LoggerFactory.getLogger(this.getClass());
    private MimeTypeIdentifier mimeTypeIdentifier;
    private LinkExtractorRegistry linkExtractorRegistry;
    private long maxByteSize;
    private Boolean includeEmbeddedResources;
    private DomainBoundaries domainBoundaries;
    private LinkedList<CrawlJob> jobsQueue;
    private HashMap<String, CrawlJob> jobsMap;
    private HashSet<String> crawledUrls;
    private int initialDepth;
    private WebAccessData wad = null;

    public void setMimeTypeIdentifier(MimeTypeIdentifier mimeTypeIdentifier) {
        this.mimeTypeIdentifier = mimeTypeIdentifier;
    }

    public MimeTypeIdentifier getMimeTypeIdentifier() {
        return this.mimeTypeIdentifier;
    }

    public void setLinkExtractorRegistry(LinkExtractorRegistry linkExtractorRegistry) {
        this.linkExtractorRegistry = linkExtractorRegistry;
    }

    public LinkExtractorRegistry getLinkExtractorRegistry() {
        return this.linkExtractorRegistry;
    }

    @Override
    protected ExitCode crawlObjects() {
        this.initialize();
        this.processQueue();
        this.removeDeprecatedRedirections();
        boolean completed = this.jobsQueue.isEmpty();
        this.cleanUp();
        return completed ? ExitCode.COMPLETED : ExitCode.STOP_REQUESTED;
    }

    private void initialize() {
        int crawlDepth;
        if (this.mimeTypeIdentifier == null) {
            throw new IllegalArgumentException("MimeTypeIdentifier missing");
        }
        if (this.linkExtractorRegistry == null) {
            throw new IllegalArgumentException("LinkExtractorRegistry missing");
        }
        this.jobsQueue = new LinkedList();
        this.jobsMap = new HashMap(1024);
        if (this.accessData == null) {
            this.crawledUrls = new HashSet(1024);
        } else {
            this.wad = new WebAccessData(this.accessData);
        }
        WebDataSource source = (WebDataSource)this.getDataSource();
        String startUrl = source.getRootUrl();
        this.domainBoundaries = source.getDomainBoundaries();
        this.includeEmbeddedResources = source.getIncludeEmbeddedResources();
        Integer integer = source.getMaximumDepth();
        this.initialDepth = crawlDepth = integer == null ? Integer.MAX_VALUE : integer;
        Long l = source.getMaximumSize();
        this.maxByteSize = l == null ? Long.MAX_VALUE : l;
        this.schedule(startUrl, crawlDepth, false);
    }

    private void schedule(String url, int crawlDepth, boolean checkDomain) {
        if (url == null) {
            return;
        }
        if ((url = this.normalizeAndFixURL(url, null).string) == null) {
            return;
        }
        if (this.isCrawled(url)) {
            return;
        }
        if (checkDomain && !this.domainBoundaries.inDomain(url)) {
            return;
        }
        CrawlJob job = this.jobsMap.get(url);
        if (job == null) {
            job = new CrawlJob(url, crawlDepth);
            this.jobsMap.put(url, job);
        } else {
            if (job.getDepth() >= crawlDepth) {
                return;
            }
            job.setDepth(crawlDepth);
            this.jobsQueue.remove(job);
        }
        ListIterator<CrawlJob> iterator = this.jobsQueue.listIterator(this.jobsQueue.size());
        while (iterator.hasPrevious()) {
            CrawlJob scheduledJob = iterator.previous();
            if (scheduledJob.getDepth() < crawlDepth) continue;
            iterator.next();
            break;
        }
        iterator.add(job);
    }

    private boolean isCrawled(String url) {
        if (this.wad != null) {
            return this.wad.isTouched(url);
        }
        return this.crawledUrls.contains(url);
    }

    private void addCrawled(String url) {
        if (this.wad != null) {
            this.wad.touch(url);
        } else {
            this.crawledUrls.add(url);
        }
    }

    private void processQueue() {
        while (!this.jobsQueue.isEmpty() && !this.isStopRequested()) {
            CrawlJob job = this.jobsQueue.removeFirst();
            String url = job.getURL();
            int depth = job.getDepth();
            if (this.logger.isDebugEnabled()) {
                try {
                    java.net.URI uri = new java.net.URI(url);
                }
                catch (URISyntaxException use) {
                    this.logger.debug("Faulty url: " + url);
                }
            }
            this.reportAccessingObject(url);
            boolean knownUrl = this.accessData == null ? false : this.accessData.isKnownId(url);
            this.addCrawled(url);
            this.jobsMap.remove(url);
            DataAccessor accessor = this.getDataAccessor(url);
            if (accessor == null) continue;
            try {
                String finalUrl;
                RDFContainerFactory containerFactory = this.getRDFContainerFactory(url);
                DataObject dataObject = accessor.getDataObjectIfModified(url, this.source, this.wad, null, containerFactory);
                if (dataObject == null) {
                    this.reportUnmodifiedDataObject(url);
                    String urlRedirectsTo = this.wad.get(url, "redirectsTo");
                    if (urlRedirectsTo != null) {
                        this.addCrawled(urlRedirectsTo);
                    }
                    if (depth <= 0) continue;
                    this.scheduleCachedLinks(url, depth - 1);
                    continue;
                }
                if (depth == this.initialDepth) {
                    dataObject.getMetadata().add(NIE.rootElementOf, this.source.getID());
                }
                if (!(finalUrl = dataObject.getID().toString()).equals(url)) {
                    CrawlJob redundantJob = this.jobsMap.remove(finalUrl);
                    if (redundantJob != null) {
                        this.jobsQueue.remove(redundantJob);
                    }
                    if (this.isCrawled(finalUrl)) {
                        dataObject.dispose();
                        continue;
                    }
                    this.addCrawled(finalUrl);
                }
                if (this.hasAcceptableByteSize(dataObject)) {
                    if (dataObject instanceof FileDataObject) {
                        this.processLinks((FileDataObject)dataObject, depth - 1);
                    }
                    if (knownUrl) {
                        this.reportModifiedDataObject(dataObject);
                        continue;
                    }
                    this.reportNewDataObject(dataObject);
                    continue;
                }
                this.unregisterUrl(url, knownUrl);
                dataObject.dispose();
            }
            catch (UrlNotFoundException e) {
                this.unregisterUrl(url, knownUrl);
            }
            catch (IOException e) {
                this.logger.info("I/O error while accessing " + url, e);
            }
            catch (Exception e) {
                this.logger.info("Error while accessing " + url, e);
            }
        }
    }

    private boolean hasAcceptableByteSize(DataObject dataObject) {
        if (this.maxByteSize == Long.MAX_VALUE) {
            return true;
        }
        Long l = dataObject.getMetadata().getLong(NIE.byteSize);
        return l == null ? true : l <= this.maxByteSize;
    }

    private void unregisterUrl(String url, boolean knownUrl) {
        if (knownUrl) {
            this.reportDeletedDataObject(url);
        } else if (this.accessData != null) {
            this.accessData.remove(url);
        }
    }

    private DataAccessor getDataAccessor(String url) {
        int index = url.indexOf(58);
        if (index <= 0) {
            return null;
        }
        String scheme = url.substring(0, index);
        Set factories = this.accessorRegistry.get(scheme);
        if (factories.isEmpty()) {
            return null;
        }
        DataAccessorFactory factory = (DataAccessorFactory)factories.iterator().next();
        return factory.get();
    }

    private void scheduleCachedLinks(String url, int depth) {
        if (this.accessData == null) {
            this.logger.error("Internal error: scheduling cached links for unmodified url while no AccessData is set: " + url);
        } else {
            Set links;
            String redirectedUrl = this.accessData.get(url, "redirectsTo");
            if (redirectedUrl != null) {
                url = redirectedUrl;
            }
            if ((links = this.accessData.getReferredIDs(url)) != null) {
                for (String link : links) {
                    this.schedule(link, depth, true);
                }
            }
        }
    }

    private void processLinks(FileDataObject object, int depth) {
        InputStream content;
        String url = object.getID().toString();
        if (this.accessData != null) {
            this.accessData.removeReferredIDs(url);
        }
        if ((content = this.getMarkSupportingContent(object)) == null) {
            return;
        }
        String mimeType = this.getMimeType(content, object);
        if (mimeType == null) {
            return;
        }
        LinkExtractor extractor = this.getLinkExtractor(mimeType);
        if (extractor == null) {
            return;
        }
        if ((content = this.getByteArrayContent(content, object)) == null) {
            return;
        }
        List<String> links = this.getLinks(content, extractor, url);
        if (links == null) {
            return;
        }
        HashSet<String> scheduledLinks = new HashSet<String>(links.size());
        for (String link : links) {
            StringUriPair pair = this.normalizeAndFixURL(link, object.getMetadata().getModel());
            link = pair.string;
            URI linkedResourceUri = pair.uri;
            if (link == null || url.equals(link) || scheduledLinks.contains(link)) continue;
            if (depth >= 0) {
                if (link != null) {
                    this.schedule(link, depth, true);
                    if (linkedResourceUri != null) {
                        object.getMetadata().add(NIE.links, linkedResourceUri);
                        object.getMetadata().getModel().addStatement(linkedResourceUri, RDF.type, NIE.DataObject);
                        scheduledLinks.add(link);
                    }
                } else {
                    this.logger.warn("WebCrawler is skipping link {}", (Object)link);
                    continue;
                }
            }
            if (this.accessData == null) continue;
            this.accessData.putReferredID(url, link);
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    private List<String> getLinks(InputStream content, LinkExtractor extractor, String url) {
        try {
            content.mark(Integer.MAX_VALUE);
            HashMap<Object, Object> params = new HashMap<Object, Object>();
            params.put(LinkExtractor.BASE_URL_KEY, url);
            if (this.includeEmbeddedResources != null) {
                params.put(LinkExtractor.INCLUDE_EMBEDDED_RESOURCES_KEY, this.includeEmbeddedResources);
            }
            List list = extractor.extractLinks(content, params);
            return list;
        }
        catch (Exception e) {
            this.logger.info("IOException while extracting links", e);
        }
        finally {
            try {
                content.reset();
            }
            catch (IOException e) {
                this.logger.warn("internal error: IOException while resetting a ByteArrayInputStream", e);
            }
        }
        return null;
    }

    private InputStream getMarkSupportingContent(FileDataObject object) {
        try {
            InputStream content = null;
            content = object.getContent();
            if (!content.markSupported()) {
                content = new BufferedInputStream(content);
            }
            return content;
        }
        catch (IOException ioe) {
            this.logger.info("IOException while obtaining the object content", ioe);
            object.setContent(null);
            return null;
        }
    }

    private InputStream getByteArrayContent(InputStream content, FileDataObject object) {
        if (!(content instanceof ByteArrayInputStream)) {
            try {
                content = new ByteArrayInputStream(IOUtil.readBytes(content));
            }
            catch (IOException e) {
                this.logger.warn("IOException while buffering document", e);
                object.setContent(null);
                return null;
            }
            object.setContent(content);
            return content;
        }
        return content;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    private String getMimeType(InputStream content, FileDataObject object) {
        String mimeType = null;
        try {
            int bufferSize = this.mimeTypeIdentifier.getMinArrayLength();
            content.mark(bufferSize);
            try {
                byte[] magicBytes = IOUtil.readBytes(content, bufferSize);
                mimeType = this.mimeTypeIdentifier.identify(magicBytes, null, object.getID());
            }
            finally {
                content.reset();
            }
        }
        catch (IOException ioe) {
            this.logger.debug("IOError while determining the mime type", ioe);
            try {
                content.close();
            }
            catch (Exception e) {
                // empty catch block
            }
            object.setContent(null);
        }
        if (mimeType == null) {
            mimeType = object.getMetadata().getString(NIE.mimeType);
        } else {
            object.getMetadata().put(NIE.mimeType, mimeType);
        }
        return mimeType;
    }

    private LinkExtractor getLinkExtractor(String mimeType) {
        Set factories = this.linkExtractorRegistry.get(mimeType);
        if (!factories.isEmpty()) {
            LinkExtractorFactory factory = (LinkExtractorFactory)factories.iterator().next();
            return factory.get();
        }
        return null;
    }

    private StringUriPair normalizeAndFixURL(String url, Model model) {
        String resultUrl = url;
        if (url.startsWith("file:") || url.startsWith("http:") || url.startsWith("https:")) {
            try {
                String externalForm;
                URL parsedUrl = new URL(url);
                resultUrl = externalForm = UrlUtil.normalizeURL(parsedUrl).toExternalForm();
            }
            catch (MalformedURLException e) {
                return new StringUriPair(null, null);
            }
        }
        URI resultUri = null;
        try {
            resultUri = model != null ? model.createURI(resultUrl) : new URIImpl(resultUrl);
        }
        catch (IllegalArgumentException iae) {
            try {
                if (resultUrl.startsWith("file:") || resultUrl.startsWith("http:") || resultUrl.startsWith("https:")) {
                    try {
                        URL parsedLink = new URL(resultUrl);
                        java.net.URI parsedUri = new java.net.URI(parsedLink.getProtocol(), parsedLink.getAuthority(), parsedLink.getPath(), parsedLink.getQuery(), parsedLink.getRef());
                        resultUrl = parsedUri.toString();
                        resultUri = model.createURI(resultUrl);
                    }
                    catch (MalformedURLException mfe) {
                        resultUrl = null;
                        resultUri = null;
                    }
                    catch (URISyntaxException e) {
                        resultUrl = null;
                        resultUri = null;
                    }
                } else {
                    resultUrl = null;
                    resultUri = null;
                }
            }
            catch (ModelRuntimeException e) {
                this.logger.debug("Unable to create URI for link {}", (Object)resultUrl);
                resultUrl = null;
                resultUri = null;
            }
        }
        return new StringUriPair(resultUrl, resultUri);
    }

    private void removeDeprecatedRedirections() {
        if (this.accessData != null) {
            HashSet<String> deprecatedRedirections = new HashSet<String>();
            ClosableIterator iter = this.accessData.getUntouchedIDsIterator();
            while (iter.hasNext()) {
                String url = iter.next().toString();
                if (this.accessData.get(url, "redirectsTo") == null) continue;
                deprecatedRedirections.add(url);
            }
            for (String dep : deprecatedRedirections) {
                this.accessData.touch(dep);
                this.accessData.remove(dep, "redirectsTo");
            }
        }
    }

    private void cleanUp() {
        this.domainBoundaries = null;
        this.jobsQueue = null;
        this.jobsMap = null;
        this.crawledUrls = null;
        this.includeEmbeddedResources = null;
    }

    private class WebAccessData
    extends FilterAccessData {
        public WebAccessData(AccessData accessData) {
            super(accessData);
        }

        public void put(String id, String key, String value) {
            if ("redirectsTo".equals(key)) {
                this.touch(id);
                CrawlJob job = (CrawlJob)WebCrawler.this.jobsMap.remove(id);
                if (job != null) {
                    WebCrawler.this.jobsQueue.remove(job);
                }
            }
            super.put(id, key, value);
        }
    }

    private static class StringUriPair {
        private String string;
        private URI uri;

        public StringUriPair(String string, URI uri) {
            this.string = string;
            this.uri = uri;
        }
    }
}

