/*
 * Decompiled with CFR 0.152.
 */
package de.berlin.hu.uima.cr.xml;

import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.uima.cr.xml.XMLCollectionReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.semantic.NamedEntity;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Collection reader for a corpus stored as one sub-directory per document,
 * where each sub-directory contains a {@code scrapbook.xml} file.
 *
 * <p>For every document the text of all {@code snippet} elements is concatenated
 * (each snippet trimmed and followed by a blank line) and used as the CAS document
 * text. Direct {@code ne} children of a snippet that carry a {@code chebi-id}
 * attribute are added to the CAS as gold-standard {@link NamedEntity} annotations.
 */
public class PatentCorpusCollectionReader
extends XMLCollectionReader {
    /** Name of the per-document XML file looked up in every sub-directory. */
    private static final String SCRAPBOOK_FILE_NAME = "scrapbook.xml";

    /**
     * Pattern removed from the {@code chebi-id} attribute value before it is stored.
     * NOTE(review): the pattern starts with a literal {@code W}; it may have been
     * meant as {@code \\W} or a plain prefix match. Kept unchanged - confirm intent.
     */
    private static final Pattern CHEBI_ID_PREFIX = Pattern.compile("W[^:]+:");

    /**
     * Collects the {@code scrapbook.xml} file of every direct sub-directory of
     * {@code inputDir}. At most one file per sub-directory is returned.
     *
     * @param inputDir path of the corpus root directory
     * @return the scrapbook files found, in directory listing order; never {@code null}
     * @throws IllegalArgumentException if {@code inputDir} cannot be listed
     *         (it does not exist, is not a directory, or an I/O error occurred)
     */
    @Override
    protected List<File> getfiles(String inputDir) {
        List<File> scrapbooks = new ArrayList<File>();
        File[] subdirs = new File(inputDir).listFiles();
        if (subdirs == null) {
            // listFiles() yields null instead of throwing; fail with a usable message
            // rather than a bare NullPointerException further down.
            throw new IllegalArgumentException("Cannot list input directory: " + inputDir);
        }
        for (File subdir : subdirs) {
            if (!subdir.isDirectory()) {
                continue;
            }
            File[] candidates = subdir.listFiles();
            if (candidates == null) {
                // Unreadable sub-directory: nothing to collect from it.
                continue;
            }
            for (File candidate : candidates) {
                if (candidate.isFile() && SCRAPBOOK_FILE_NAME.equals(candidate.getName())) {
                    scrapbooks.add(candidate);
                    break;
                }
            }
        }
        return scrapbooks;
    }

    /**
     * Reads the next document and fills the given CAS with its text, a
     * {@link SourceDocumentInformation}, a {@link PubmedDocument} spanning the whole
     * text and one {@link NamedEntity} per located gold-standard entity.
     *
     * @param aCAS the CAS to populate
     * @throws IOException propagated from reading the next document
     * @throws CollectionException if the JCas view of {@code aCAS} cannot be obtained
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        Document document = this.getNextDocument();
        JCas jcas;
        try {
            jcas = aCAS.getJCas();
        }
        catch (CASException e) {
            throw new CollectionException(e);
        }

        // Build the document text from all snippets, remembering the nodes so the
        // entities can be located in the same order afterwards.
        NodeList snippetList = document.getElementsByTagName("snippet");
        int snippetCount = snippetList.getLength();
        List<Node> snippetNodes = new ArrayList<Node>(snippetCount);
        StringBuilder textBuilder = new StringBuilder();
        for (int i = 0; i < snippetCount; i++) {
            Node snippet = snippetList.item(i);
            textBuilder.append(snippet.getTextContent().trim()).append("\n\n");
            snippetNodes.add(snippet);
        }
        String text = textBuilder.toString();
        jcas.setDocumentText(text);

        SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
        // Drop the last path segment of the document URI (the file name).
        srcDocInfo.setUri(document.getDocumentURI().replaceFirst("(/|\\\\)[^/\\\\]+$", ""));
        srcDocInfo.setOffsetInSource(0);
        srcDocInfo.setDocumentSize(text.length());
        // NOTE(review): the flag receives hasNext() unchanged; if hasNext() is true
        // while more documents remain this looks inverted - confirm before changing.
        srcDocInfo.setLastSegment(this.hasNext());
        srcDocInfo.addToIndexes();

        PubmedDocument abstractAnnotation = new PubmedDocument(jcas);
        abstractAnnotation.setBegin(0);
        abstractAnnotation.setEnd(text.length());
        abstractAnnotation.setPmid("");
        abstractAnnotation.addToIndexes(jcas);

        // Entities are located sequentially: each search starts where the previous
        // match ended, so repeated mentions map to successive occurrences.
        int offset = 0;
        for (Node snippetNode : snippetNodes) {
            offset = annotateEntities(jcas, text, snippetNode, offset);
        }
    }

    /**
     * Adds a {@link NamedEntity} for every direct {@code ne} child of
     * {@code snippetNode} that has non-empty text and a {@code chebi-id} attribute,
     * provided its text occurs in {@code text} at or after {@code offset}.
     *
     * @param jcas the JCas receiving the annotations
     * @param text the complete document text
     * @param snippetNode the snippet element whose children are inspected
     * @param offset absolute position in {@code text} at which to start searching
     * @return the absolute position just after the last entity that was located,
     *         or {@code offset} unchanged if none was
     */
    private static int annotateEntities(JCas jcas, String text, Node snippetNode, int offset) {
        NodeList children = snippetNode.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (!"ne".equals(child.getNodeName()) || child.getTextContent().isEmpty()) {
                continue;
            }
            Node chebiIdNode = child.getAttributes().getNamedItem("chebi-id");
            if (chebiIdNode == null) {
                continue;
            }
            String chemical = child.getTextContent();
            String chebiId = CHEBI_ID_PREFIX.matcher(chebiIdNode.getTextContent()).replaceAll("");

            // Literal search from the current position; indexes are absolute.
            int begin = text.indexOf(chemical, offset);
            if (begin < 0) {
                continue;
            }
            int end = begin + chemical.length();

            NamedEntity namedEntity = new NamedEntity(jcas);
            namedEntity.setBegin(begin);
            namedEntity.setEnd(end);
            namedEntity.setConfidence(1.0);
            namedEntity.setId("," + chebiId);
            namedEntity.setSource("goldstandard");
            namedEntity.addToIndexes();

            // Continue after this match. The previous code stored the match end
            // relative to the searched substring, which moved the cursor backwards.
            offset = end;
        }
        return offset;
    }
}

