/*
 * Decompiled with CFR 0.152.
 */
package net.sf.regain.crawler.preparator.html;

import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.RawDocument;
import net.sf.regain.crawler.preparator.html.AbstractExtractor;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

public class HtmlContentExtractor
extends AbstractExtractor {
    private RE mHeadlineRE;
    private int mHeadlineRegexGroup = -1;

    public HtmlContentExtractor(String prefix, String contentStartRegex, String contentEndRegex, String headlineRegex, int headlineRegexGroup) throws RegainException {
        super(prefix, contentStartRegex, contentEndRegex);
        try {
            if (headlineRegex != null && headlineRegex.length() != 0) {
                this.mHeadlineRE = new RE(headlineRegex, 3);
                this.mHeadlineRegexGroup = headlineRegexGroup;
            }
        }
        catch (RESyntaxException exc) {
            throw new RegainException("Syntax error in regular expression", (Throwable)exc);
        }
    }

    public String extractContent(RawDocument rawDocument) throws RegainException {
        return this.extractFragment(rawDocument);
    }

    public String extractHeadlines(String content) {
        if (this.mHeadlineRE == null) {
            return null;
        }
        int offset = 0;
        StringBuffer buffer = null;
        while (this.mHeadlineRE.match(content, offset)) {
            String headline = this.mHeadlineRE.getParen(this.mHeadlineRegexGroup);
            if ((headline = headline.trim()).length() != 0) {
                if (buffer == null) {
                    buffer = new StringBuffer();
                }
                buffer.append(headline);
                buffer.append("\n");
            }
            offset = this.mHeadlineRE.getParenEnd(0);
        }
        if (buffer == null) {
            return null;
        }
        return buffer.toString();
    }
}

