/*
 * Decompiled with CFR 0.152.
 */
package net.sf.regain.crawler.preparator.html;

import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.RawDocument;
import org.apache.log4j.Logger;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

public class AbstractExtractor {
    private static Logger mLog = Logger.getLogger(AbstractExtractor.class);
    private String mPrefix;
    private RE mFragmentStartRE;
    private String mFragmentStartRegex;
    private RE mFragmentEndRE;
    private String mFragmentEndRegex;

    public AbstractExtractor(String prefix, String fragmentStartRegex, String fragmentEndRegex) throws RegainException {
        this.mPrefix = prefix;
        try {
            if (fragmentStartRegex != null && fragmentStartRegex.length() != 0) {
                this.mFragmentStartRE = new RE(fragmentStartRegex, 1);
                this.mFragmentStartRegex = fragmentStartRegex;
            }
            if (fragmentEndRegex != null && fragmentEndRegex.length() != 0) {
                this.mFragmentEndRE = new RE(fragmentEndRegex, 1);
                this.mFragmentEndRegex = fragmentEndRegex;
            }
        }
        catch (RESyntaxException exc) {
            throw new RegainException("Syntax error in regular expression", (Throwable)exc);
        }
    }

    public boolean accepts(RawDocument rawDocument) {
        return rawDocument.getUrl().matches(this.mPrefix);
    }

    protected String extractFragment(RawDocument rawDocument) throws RegainException {
        String content = rawDocument.getContentAsString();
        int fragmentStart = 0;
        if (this.mFragmentStartRE != null) {
            if (this.mFragmentStartRE.match(content)) {
                fragmentStart = this.mFragmentStartRE.getParenEnd(0);
            } else {
                mLog.warn((Object)("The regular expression '" + this.mFragmentStartRegex + "' had no " + "match for '" + rawDocument.getUrl() + "'"));
            }
        }
        int fragmentEnd = content.length();
        if (this.mFragmentEndRE != null) {
            if (this.mFragmentEndRE.match(content, fragmentStart)) {
                fragmentEnd = this.mFragmentEndRE.getParenStart(0);
            } else {
                mLog.warn((Object)("The regular expression '" + this.mFragmentEndRegex + "' had no " + "match for '" + rawDocument.getUrl() + "'"));
            }
        }
        if (fragmentStart == 0 && fragmentEnd == content.length()) {
            return content;
        }
        return content.substring(fragmentStart, fragmentEnd);
    }
}

