/*
 * Decompiled with CFR 0.152.
 */
package de.berlin.hu.uima.cr.iob;

import de.berlin.hu.types.PubmedDocument;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.impl.XCASDeserializer;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.apache.uima.util.XMLInputSource;
import org.u_compare.shared.semantic.NamedEntity;
import org.u_compare.shared.syntactic.Token;
import org.xml.sax.SAXException;

/**
 * UIMA collection reader that delivers one CAS per input file.
 *
 * <p>Depending on the {@value #PARAM_XCAS} parameter a file is either
 * deserialized as XMI/XCAS, or read as an IOB corpus: a tab-separated text
 * file with one token per line ({@code text, begin, end, ..., tag}) in which
 * lines starting with {@code ###} introduce a new abstract. For IOB input the
 * document text is rebuilt from the tokens and their offsets, a
 * {@link PubmedDocument} annotation is added per abstract and, if enabled,
 * {@link NamedEntity} annotations are added for {@code |B-}/{@code |I-}
 * tagged token runs.
 */
public class IOBDirectoryCollectionReader
extends CollectionReader_ImplBase {
    /** Path of the input directory (or of a single input file). */
    public static final String PARAM_INPUTDIR = "InputDirectory";
    /** Character encoding used to decode IOB text files; platform default if absent. */
    public static final String PARAM_ENCODING = "Encoding";
    /** Optional document language that is set on every CAS built from IOB text. */
    public static final String PARAM_LANGUAGE = "Language";
    /** Input format switch: "xmi", "xcas" or "true" select CAS deserialization, anything else IOB text. */
    public static final String PARAM_XCAS = "XCAS";
    /** "true" makes XMI/XCAS deserialization lenient. */
    public static final String PARAM_LENIENT = "LENIENT";
    /** Whether named entity annotations are created from the IOB tags. */
    public static final String PARAM_USE_GOLDSTANDARD_ANNOTATIONS = "UseGoldstandardAnnotations";
    /** Only tags ending with this suffix yield entities; an empty or absent suffix accepts every tag. */
    public static final String PARAM_GOLDSTANDARD_TYPE_SUFFIX = "GoldstandardTypeSuffix";
    /** File name suffixes a file must end with to be picked up from the input directory. */
    public static final String PARAM_INCLUDE_SUFFIXES = "IncludeSuffixes";

    private static final String I_TYPE = "|I-";
    private static final String B_TYPE = "|B-";
    private static final String ENTITY_TYPE = "IUPAC";
    private static final String DESCRIPTOR_RESOURCE = "IOBDirectoryCollectionReader.xml";
    /** Matches a line that carries an "|I-" tag directly after a tab, i.e. a line continuing an entity. */
    private static final Pattern ENTITY_CONTINUATION_LINE = Pattern.compile(".*\\t\\|I-.*");
    private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]+");

    private List<File> mFiles = new ArrayList<File>();
    private String mEncoding;
    private String mLanguage;
    private int mCurrentIndex;
    private boolean mTEXT;
    private String mXCAS;
    private boolean lenient;
    // The following two counters are not read or written anywhere in this class.
    private int currentDocument = 0;
    private int numDocuments = 0;
    /** Number of entities seen per type name (the part of the tag after the first '-'). */
    private Map<String, Integer> typeCounts = new HashMap<String, Integer>();
    private int numberOfTokens;
    private boolean useGoldstandardAnnotations;
    private String goldstandardTypeSuffix = "";
    private int numberOfTokensInEntity;
    private StringBuilder docText;
    private int numberOfEntities;
    private int numberOfShortEntities;
    private HashSet<String> includeSuffixes;
    private final boolean provideGoldstandardTokenization = false;

    /**
     * Reads the configuration parameters and collects the files to process.
     *
     * <p>If the input path is a directory, its non-directory entries whose
     * absolute path ends with one of the include suffixes are collected, in
     * sorted order so that the processing order is reproducible. If the input
     * path is a regular file, that file alone is processed.
     *
     * @throws ResourceInitializationException if a required parameter is
     *         absent, the input path does not exist or cannot be listed
     */
    @Override
    public void initialize() throws ResourceInitializationException {
        String dirPath = (String)this.getConfigParameterValue(PARAM_INPUTDIR);
        if (dirPath == null) {
            throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{PARAM_INPUTDIR});
        }
        File directory = new File(dirPath.trim());
        this.mEncoding = (String)this.getConfigParameterValue(PARAM_ENCODING);
        this.mLanguage = (String)this.getConfigParameterValue(PARAM_LANGUAGE);
        this.mXCAS = (String)this.getConfigParameterValue(PARAM_XCAS);
        this.mTEXT = !"xcas".equalsIgnoreCase(this.mXCAS) && !"xmi".equalsIgnoreCase(this.mXCAS) && !"true".equalsIgnoreCase(this.mXCAS);
        this.lenient = "true".equalsIgnoreCase((String)this.getConfigParameterValue(PARAM_LENIENT));
        this.mCurrentIndex = 0;
        String[] suffixes = (String[])this.getConfigParameterValue(PARAM_INCLUDE_SUFFIXES);
        this.includeSuffixes = new HashSet<String>();
        if (suffixes != null) {
            this.includeSuffixes.addAll(Arrays.asList(suffixes));
        }
        if (!directory.exists()) {
            throw new ResourceInitializationException("directory_not_found", new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()});
        }
        this.mFiles = new ArrayList<File>();
        if (directory.isDirectory()) {
            // The suffix filter is only needed when a directory has to be scanned.
            if (suffixes == null) {
                throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{PARAM_INCLUDE_SUFFIXES});
            }
            File[] files = directory.listFiles();
            if (files == null) {
                // listFiles() returns null on an I/O error instead of throwing.
                throw new ResourceInitializationException(new IOException("Unable to list directory: " + directory.getPath()));
            }
            // listFiles() gives no ordering guarantee; sort for a reproducible document order.
            Arrays.sort(files);
            for (File candidate : files) {
                if (!candidate.isDirectory() && this.hasIncludedSuffix(candidate)) {
                    this.mFiles.add(candidate);
                }
            }
        } else {
            this.mFiles.add(directory);
        }
        // An absent flag counts as false instead of failing on unboxing.
        this.useGoldstandardAnnotations = Boolean.TRUE.equals(this.getConfigParameterValue(PARAM_USE_GOLDSTANDARD_ANNOTATIONS));
        String typeSuffix = (String)this.getConfigParameterValue(PARAM_GOLDSTANDARD_TYPE_SUFFIX);
        this.goldstandardTypeSuffix = typeSuffix == null ? "" : typeSuffix;
        IOBDirectoryCollectionReader.log(Level.INFO, "CR initialization complete. # files to process: " + this.mFiles.size());
    }

    /** Returns whether the absolute path of {@code file} ends with one of the configured suffixes. */
    private boolean hasIncludedSuffix(File file) {
        String path = file.getAbsolutePath();
        for (String suffix : this.includeSuffixes) {
            if (path.endsWith(suffix)) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether there are files left that have not been handed out by {@link #getNext(CAS)}. */
    @Override
    public boolean hasNext() {
        return this.mCurrentIndex < this.mFiles.size();
    }

    /**
     * Fills {@code aCAS} from the next input file and advances the file index.
     *
     * @param aCAS the CAS to populate
     * @throws IOException if the file cannot be read
     * @throws CollectionException if no JCas can be obtained or the XML input is invalid
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        JCas jcas;
        try {
            jcas = aCAS.getJCas();
        }
        catch (CASException e) {
            throw new CollectionException(e);
        }
        File file = this.mFiles.get(this.mCurrentIndex++);
        if (this.mTEXT) {
            this.readTextDocument(file, aCAS, jcas);
        } else {
            this.readSerializedCas(file, aCAS);
        }
    }

    /**
     * Populates the CAS from a text file, either through the configured CAS
     * initializer or by parsing the file as IOB corpus, and attaches language
     * and source document information.
     */
    private void readTextDocument(File file, CAS aCAS, JCas jcas) throws IOException, CollectionException {
        if (this.getCasInitializer() != null) {
            FileInputStream fis = new FileInputStream(file);
            try {
                this.getCasInitializer().initializeCas(fis, aCAS);
            }
            finally {
                fis.close();
            }
        } else {
            String text = file.getAbsolutePath().endsWith("gz") ? this.readGzipText(file) : FileUtils.file2String(file, this.mEncoding);
            this.buildCASFromIOBCorpus(jcas, text);
        }
        if (this.mLanguage != null) {
            jcas.setDocumentLanguage(this.mLanguage);
        }
        SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
        srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
        srcDocInfo.setOffsetInSource(0);
        srcDocInfo.setDocumentSize((int)file.length());
        srcDocInfo.setLastSegment(this.mCurrentIndex == this.mFiles.size());
        srcDocInfo.addToIndexes();
    }

    /**
     * Reads a gzip-compressed text file line by line and returns its content
     * with every line terminated by a single '\n'. The configured encoding is
     * applied; without one the platform default charset is used.
     */
    private String readGzipText(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            // Re-assigning keeps exactly one stream to close, whichever constructor fails.
            in = new GZIPInputStream(in);
            InputStreamReader decoder = this.mEncoding == null ? new InputStreamReader(in) : new InputStreamReader(in, this.mEncoding);
            BufferedReader reader = new BufferedReader(decoder);
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        }
        finally {
            in.close();
        }
    }

    /** Deserializes an XMI file (format "xmi") or an XCAS file (any other non-text format) into the CAS. */
    private void readSerializedCas(File file, CAS aCAS) throws IOException, CollectionException {
        FileInputStream fis = new FileInputStream(file);
        try {
            if ("xmi".equalsIgnoreCase(this.mXCAS)) {
                XmiCasDeserializer.deserialize(fis, aCAS, this.lenient);
            } else {
                XCASDeserializer.deserialize(fis, aCAS, this.lenient);
            }
        }
        catch (SAXException e) {
            IOBDirectoryCollectionReader.log(Level.WARNING, "Problem with XML input file: " + file.getAbsolutePath());
            throw new CollectionException(e);
        }
        finally {
            fis.close();
        }
    }

    /** Nothing to release: streams are closed as soon as each file has been read. */
    @Override
    public void close() throws IOException {
    }

    /** Reports progress as the number of files handed out so far against the total number of files. */
    @Override
    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.mCurrentIndex, this.mFiles.size(), "entities")};
    }

    /** Returns the number of files collected by {@link #initialize()}. */
    public int getNumberOfDocuments() {
        return this.mFiles.size();
    }

    /**
     * Parses the collection reader descriptor that is stored as a class path
     * resource next to this class.
     *
     * @return the parsed descriptor
     * @throws InvalidXMLException if the descriptor is missing or not valid
     */
    public static CollectionReaderDescription getDescription() throws InvalidXMLException {
        InputStream descStream = IOBDirectoryCollectionReader.class.getResourceAsStream(DESCRIPTOR_RESOURCE);
        if (descStream == null) {
            throw new InvalidXMLException(new IOException("Descriptor resource not found: " + DESCRIPTOR_RESOURCE));
        }
        try {
            return UIMAFramework.getXMLParser().parseCollectionReaderDescription(new XMLInputSource(descStream, null));
        }
        finally {
            try {
                descStream.close();
            }
            catch (IOException ignored) {
                // The descriptor has been parsed (or parsing already failed); a close failure adds nothing.
            }
        }
    }

    /** Returns the class path URL of the collection reader descriptor, or null if it is not found. */
    public static URL getDescriptorURL() {
        return IOBDirectoryCollectionReader.class.getResource(DESCRIPTOR_RESOURCE);
    }

    /**
     * Rebuilds the document text from an IOB corpus and adds abstract and
     * entity annotations to the CAS.
     *
     * <p>Lines starting with {@code ###} are abstract headers; the text after
     * the fourth character is taken as the PMID. Every other non-blank line is
     * a tab-separated token line whose first three columns are token text,
     * begin offset and end offset, and whose last column is the tag. Token
     * lines with fewer than three columns or non-numeric offsets are logged
     * and skipped; lines with more than five columns are logged and their tag
     * is ignored.
     *
     * @param jcas the CAS to fill
     * @param iobText the complete content of one IOB file
     */
    private void buildCASFromIOBCorpus(JCas jcas, String iobText) {
        this.docText = new StringBuilder();
        Scanner lineScanner = new Scanner(iobText);
        try {
            lineScanner.useDelimiter("\n");
            int endOfCurrentToken = 0;
            int beginOfCurrentEntity = -1;
            int beginOfCurrentAbstract = 0;
            // NOTE(review): this offset is not reset when a new "###" header starts, so the
            // first token after a header usually gets no padding (negative gap) - confirm intent.
            int endOfLastTokenInCurrentAbstract = 0;
            boolean firstHeader = true;
            boolean newAbstract = false;
            String currentPMID = "";
            String currentLine = IOBDirectoryCollectionReader.readLine(lineScanner);
            while (currentLine != null) {
                // One line of lookahead decides whether the current token ends an entity.
                String nextLine = IOBDirectoryCollectionReader.readLine(lineScanner);
                if (currentLine.startsWith("###")) {
                    if (firstHeader) {
                        firstHeader = false;
                    } else {
                        this.docText.append("\n");
                        newAbstract = true;
                        this.addAbstractAnnotation(jcas, beginOfCurrentAbstract, endOfCurrentToken, currentPMID);
                        beginOfCurrentAbstract = this.docText.length();
                    }
                    // A bare "###" header has no PMID part to cut out.
                    currentPMID = currentLine.length() > 4 ? currentLine.substring(4) : "";
                } else if (!currentLine.trim().isEmpty()) {
                    String[] columns = currentLine.split("\t");
                    int beginInAbstract = 0;
                    int endInAbstract = 0;
                    boolean wellFormed = columns.length >= 3;
                    if (wellFormed) {
                        try {
                            beginInAbstract = Integer.parseInt(columns[1]);
                            endInAbstract = Integer.parseInt(columns[2]);
                        }
                        catch (NumberFormatException e) {
                            wellFormed = false;
                        }
                    }
                    if (!wellFormed) {
                        IOBDirectoryCollectionReader.log(Level.WARNING, "Wrong corpus format: " + currentLine);
                    } else {
                        ++this.numberOfTokens;
                        String currentTokenText = columns[0];
                        if (newAbstract) {
                            newAbstract = false;
                            // A long all-digit first token after a header is blanked out
                            // (presumably the PMID repeated in the token stream - TODO confirm).
                            if (currentTokenText.length() > 4 && DIGITS_ONLY.matcher(currentTokenText).matches()) {
                                currentTokenText = IOBDirectoryCollectionReader.insertNWhitespaces(currentTokenText.length());
                            }
                        }
                        String currentTokenType = columns[columns.length - 1];
                        // Pad with as many blanks as lie between the previous and the current token.
                        this.docText.append(IOBDirectoryCollectionReader.insertNWhitespaces(beginInAbstract - endOfLastTokenInCurrentAbstract));
                        endOfLastTokenInCurrentAbstract = endInAbstract;
                        int beginOfCurrentToken = this.docText.length();
                        this.docText.append(currentTokenText);
                        endOfCurrentToken = this.docText.length();
                        if (columns.length > 5) {
                            IOBDirectoryCollectionReader.log(Level.WARNING, "Wrong corpus format: " + currentLine);
                        } else {
                            boolean startsEntity = currentTokenType.startsWith(B_TYPE);
                            boolean continuesEntity = !startsEntity && currentTokenType.startsWith(I_TYPE);
                            if (startsEntity || continuesEntity) {
                                ++this.numberOfTokensInEntity;
                                // An "I-" tag without an open entity starts one at the current token
                                // instead of producing an annotation with a negative or stale begin.
                                if (startsEntity || beginOfCurrentEntity < 0) {
                                    beginOfCurrentEntity = beginOfCurrentToken;
                                }
                                if (nextLine == null || !ENTITY_CONTINUATION_LINE.matcher(nextLine).matches()) {
                                    this.generateNewNamedEntityAnnotation(beginOfCurrentEntity, endOfCurrentToken, currentTokenType, jcas);
                                    beginOfCurrentEntity = -1;
                                }
                            }
                        }
                    }
                }
                currentLine = nextLine;
            }
            this.addAbstractAnnotation(jcas, beginOfCurrentAbstract, endOfCurrentToken, currentPMID);
        }
        finally {
            lineScanner.close();
        }
        jcas.setDocumentText(this.docText.toString());
    }

    /**
     * Returns the next line from the scanner without a trailing carriage
     * return, or null when the input is exhausted. Stripping the '\r' makes
     * CRLF files behave like LF files (the gzip path already normalizes them).
     */
    private static String readLine(Scanner lineScanner) {
        if (!lineScanner.hasNext()) {
            return null;
        }
        String line = lineScanner.next();
        if (line.endsWith("\r")) {
            return line.substring(0, line.length() - 1);
        }
        return line;
    }

    /** Adds a {@link PubmedDocument} annotation for the given span unless the span is empty. */
    private void addAbstractAnnotation(JCas jcas, int begin, int end, String pmid) {
        if (begin == end) {
            return;
        }
        PubmedDocument abstractAnnotation = new PubmedDocument(jcas);
        abstractAnnotation.setBegin(begin);
        abstractAnnotation.setEnd(end);
        abstractAnnotation.setPmid(pmid);
        abstractAnnotation.addToIndexes(jcas);
    }

    /** Returns a string of {@code n} blanks; the empty string if {@code n} is zero or negative. */
    private static String insertNWhitespaces(int n) {
        if (n <= 0) {
            return "";
        }
        char[] blanks = new char[n];
        Arrays.fill(blanks, ' ');
        return new String(blanks);
    }

    /** Adds a {@link Token} annotation for the given span. Not called from within this class. */
    private void generateNewTokenAnnotation(int begin, int end, JCas jcas) {
        Token token2 = new Token(jcas);
        token2.setBegin(begin);
        token2.setEnd(end);
        token2.addToIndexes();
    }

    /**
     * Counts the entity under its type name and, if gold standard annotations
     * are enabled and the tag passes the suffix filter, adds a
     * {@link NamedEntity} annotation for the span. Tags ending with
     * "MODIFIER" never yield an annotation.
     *
     * @param begin begin offset of the entity in the document text
     * @param end end offset of the entity in the document text
     * @param type the complete tag of the last token of the entity, e.g. "|I-..."
     * @param jcas the CAS to add the annotation to
     */
    private void generateNewNamedEntityAnnotation(int begin, int end, String type, JCas jcas) {
        String[] tagParts = type.split("-");
        // A tag such as "|B-" has no type name; count it under the empty name instead of failing.
        String typeString = tagParts.length > 1 ? tagParts[1] : "";
        Integer seenSoFar = this.typeCounts.get(typeString);
        this.typeCounts.put(typeString, seenSoFar == null ? 1 : seenSoFar + 1);
        String suffix = this.goldstandardTypeSuffix == null ? "" : this.goldstandardTypeSuffix;
        boolean acceptedType = suffix.isEmpty() || type.endsWith(suffix);
        if (this.useGoldstandardAnnotations && acceptedType && !type.endsWith("MODIFIER")) {
            ++this.numberOfEntities;
            NamedEntity namedEntity = new NamedEntity(jcas);
            namedEntity.setBegin(begin);
            namedEntity.setEnd(end);
            namedEntity.setEntityType(ENTITY_TYPE);
            namedEntity.setConfidence(1.0);
            namedEntity.setSource("goldstandard");
            namedEntity.addToIndexes();
            if (end - begin < 3) {
                ++this.numberOfShortEntities;
            }
        }
    }

    /** Writes a message to the UIMA logger of this class. */
    private static void log(Level level, String message) {
        UIMAFramework.getLogger(IOBDirectoryCollectionReader.class).log(level, message);
    }
}

