/*
 * Decompiled with CFR 0.152.
 */
package net.sf.regain.crawler.preparator.html;

import java.util.ArrayList;
import net.sf.regain.RegainException;
import net.sf.regain.crawler.CrawlerToolkit;
import net.sf.regain.crawler.document.PathElement;
import net.sf.regain.crawler.document.RawDocument;
import net.sf.regain.crawler.preparator.html.AbstractExtractor;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

public class HtmlPathExtractor
extends AbstractExtractor {
    private RE mPathNodeRE;
    private int mPathNodeUrlGroup;
    private int mPathNodeTitleGroup;

    public HtmlPathExtractor(String prefix, String pathStartRegex, String pathEndRegex, String pathNodeRegex, int pathNodeUrlGroup, int pathNodeTitleGroup) throws RegainException {
        super(prefix, pathStartRegex, pathEndRegex);
        try {
            this.mPathNodeRE = new RE(pathNodeRegex, 1);
        }
        catch (RESyntaxException exc) {
            throw new RegainException("Syntax error in regular expression", (Throwable)exc);
        }
        this.mPathNodeUrlGroup = pathNodeUrlGroup;
        this.mPathNodeTitleGroup = pathNodeTitleGroup;
    }

    public PathElement[] extractPath(RawDocument rawDocument) throws RegainException {
        String pathFragment = this.extractFragment(rawDocument);
        ArrayList<PathElement> list = new ArrayList<PathElement>();
        int offset = 0;
        while (this.mPathNodeRE.match(pathFragment, offset)) {
            String url = this.mPathNodeRE.getParen(this.mPathNodeUrlGroup);
            url = CrawlerToolkit.toAbsoluteUrl((String)url, (String)rawDocument.getUrl());
            String title = this.mPathNodeRE.getParen(this.mPathNodeTitleGroup);
            title = CrawlerToolkit.replaceHtmlEntities((String)title);
            list.add(new PathElement(url, title));
            offset = this.mPathNodeRE.getParenEnd(0);
        }
        if (list.isEmpty()) {
            return null;
        }
        PathElement[] asArr = new PathElement[list.size()];
        list.toArray(asArr);
        return asArr;
    }
}

