/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.tokenize;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.maxent.EventStream;
import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
import opennlp.maxent.MaxentModel;
import opennlp.maxent.TwoPassDataIndexer;
import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
import opennlp.tools.lang.english.TokenStream;
import opennlp.tools.tokenize.AbstractTokenizer;
import opennlp.tools.tokenize.DefaultTokenContextGenerator;
import opennlp.tools.tokenize.TokenContextGenerator;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.Span;

/**
 * Tokenizer that refines whitespace tokenization with a maximum-entropy model.
 *
 * <p>The input is first split on whitespace. Each resulting chunk of two or
 * more characters is then scanned position by position; at every interior
 * position the model is asked whether a token boundary belongs there, and the
 * outcome {@code "T"} is treated as "split here".
 *
 * <p>The spans and probabilities of the most recent {@link #tokenizePos(String)}
 * call are kept in instance fields and are overwritten by the next call.
 * NOTE(review): because of that shared per-call state, a single instance does
 * not look safe for concurrent use - confirm against callers.
 */
public class TokenizerME
extends AbstractTokenizer {
    /** Model outcome that marks a token boundary at the evaluated position. */
    private static final String SPLIT = "T";

    /** Probability recorded for tokens that are accepted without consulting the model. */
    private static final Double ONE = Double.valueOf(1.0);

    /** Matches strings made up solely of ASCII letters and digits. */
    public static final Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");

    /** Model used to decide whether a position inside a chunk is a token boundary. */
    private final MaxentModel model;

    /** Produces the model context for a given position inside a chunk. */
    private final TokenContextGenerator cg = new DefaultTokenContextGenerator();

    /** When true, purely alphanumeric chunks are accepted without model evaluation. */
    private boolean ALPHA_NUMERIC_OPTIMIZATION;

    /** Probabilities of the tokens produced by the most recent tokenizePos call. */
    private final List<Double> tokProbs;

    /** Spans of the tokens produced by the most recent tokenizePos call. */
    private final List<Span> newTokens;

    /**
     * Creates a tokenizer backed by the given model, with the alphanumeric
     * optimization switched off.
     *
     * @param mod the model evaluated at each candidate split position
     */
    public TokenizerME(MaxentModel mod) {
        // Kept as a method call (rather than a field write) so subclasses that
        // override the setter observe the same behaviour as before.
        this.setAlphaNumericOptimization(false);
        this.model = mod;
        this.newTokens = new ArrayList<Span>();
        this.tokProbs = new ArrayList<Double>(50);
    }

    /**
     * Returns the probabilities associated with the tokens of the most recent
     * {@link #tokenizePos(String)} call, in token order.
     *
     * @return a newly allocated array with one probability per token; empty if
     *         no tokens have been produced
     */
    public double[] getTokenProbabilities() {
        double[] tokProbArray = new double[this.tokProbs.size()];
        for (int i = 0; i < tokProbArray.length; ++i) {
            tokProbArray[i] = this.tokProbs.get(i);
        }
        return tokProbArray;
    }

    /**
     * Tokenizes the given string and returns the character spans of the tokens.
     *
     * <p>Chunks shorter than two characters, and (when the alphanumeric
     * optimization is enabled) chunks matching {@link #alphaNumeric}, are kept
     * whole with probability 1. For every other chunk the model is evaluated at
     * each interior position; a token's probability is the product of the
     * best-outcome probabilities of all positions evaluated since the previous
     * split.
     *
     * @param d the text to tokenize
     * @return the token spans, expressed as offsets into {@code d}
     */
    public Span[] tokenizePos(String d) {
        Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
        // Results are per call: drop whatever the previous call left behind.
        this.newTokens.clear();
        this.tokProbs.clear();
        for (int i = 0; i < tokens.length; ++i) {
            Span s = tokens[i];
            String tok = d.substring(s.getStart(), s.getEnd());
            boolean tooShortToSplit = tok.length() < 2;
            if (tooShortToSplit
                    || (this.useAlphaNumericOptimization() && alphaNumeric.matcher(tok).matches())) {
                this.newTokens.add(s);
                this.tokProbs.add(ONE);
                continue;
            }
            int origStart = s.getStart();
            int end = s.getEnd();
            int start = origStart;
            double tokenProb = 1.0;
            for (int j = origStart + 1; j < end; ++j) {
                // The context generator works with offsets relative to the chunk.
                double[] probs = this.model.eval(this.cg.getContext(tok, j - origStart));
                String best = this.model.getBestOutcome(probs);
                tokenProb *= probs[this.model.getIndex(best)];
                if (SPLIT.equals(best)) {
                    this.newTokens.add(new Span(start, j));
                    this.tokProbs.add(tokenProb);
                    start = j;
                    tokenProb = 1.0;
                }
            }
            // Whatever remains after the last split is the final token of the chunk.
            this.newTokens.add(new Span(start, end));
            this.tokProbs.add(tokenProb);
        }
        return this.newTokens.toArray(new Span[this.newTokens.size()]);
    }

    /**
     * Trains a model from the given events using 100 iterations and a cutoff of 5.
     *
     * @param evc the training events
     * @return the trained model
     * @throws IOException declared for callers; raised by the underlying training code
     */
    public static GISModel train(EventStream evc) throws IOException {
        return TokenizerME.train(evc, 100, 5);
    }

    /**
     * Trains a model from the given events.
     *
     * @param evc the training events
     * @param iterations the iteration count passed to {@code GIS.trainModel}
     * @param cutoff the cutoff passed to the {@code TwoPassDataIndexer}
     * @return the trained model
     * @throws IOException declared for callers; raised by the underlying training code
     */
    public static GISModel train(EventStream evc, int iterations, int cutoff) throws IOException {
        return GIS.trainModel(iterations, new TwoPassDataIndexer(evc, cutoff));
    }

    /**
     * Trains a model with 100 iterations and a cutoff of 5 and persists it to a file.
     *
     * @param evc the training events
     * @param output2 the file the model is written to
     * @param encoding passed through to the five-argument overload, which does not use it
     * @throws IOException if training or persisting the model fails
     */
    public static void train(EventStream evc, File output2, String encoding) throws IOException {
        TokenizerME.train(evc, output2, 100, 5, encoding);
    }

    /**
     * Trains a model and persists it to a file via {@code SuffixSensitiveGISModelWriter}.
     *
     * @param evc the training events
     * @param output2 the file the model is written to
     * @param iterations the iteration count used for training
     * @param cutoff the cutoff used for training
     * @param encoding currently unused by this method; retained so existing callers keep compiling
     * @throws IOException if training or persisting the model fails
     */
    public static void train(EventStream evc, File output2, int iterations, int cutoff, String encoding) throws IOException {
        GISModel trained = TokenizerME.train(evc, iterations, cutoff);
        new SuffixSensitiveGISModelWriter(trained, output2).persist();
    }

    /**
     * Enables or disables the shortcut that accepts purely alphanumeric chunks
     * as single tokens without evaluating the model.
     *
     * @param opt2 true to enable the shortcut
     */
    public void setAlphaNumericOptimization(boolean opt2) {
        this.ALPHA_NUMERIC_OPTIMIZATION = opt2;
    }

    /**
     * Reports whether the alphanumeric shortcut is enabled.
     *
     * @return true if purely alphanumeric chunks bypass the model
     */
    public boolean useAlphaNumericOptimization() {
        return this.ALPHA_NUMERIC_OPTIMIZATION;
    }

    /** Prints the command-line usage to standard error and exits with status 1. */
    private static void usage() {
        System.err.println("Usage: TokenizerME model [cutoff] [iterations] < training");
        System.err.println("This trains a new model on the specified space delimited tokens, one-sentence-per-line input and outpus the trained model to the model file.");
        System.exit(1);
    }

    /**
     * Command-line entry point: trains a model from standard input and writes
     * it to the file named by the first argument.
     *
     * <p>The optional second and third arguments override the cutoff (default 5)
     * and the iteration count (default 100) respectively; either may be omitted.
     *
     * @param args {@code model [cutoff] [iterations]}
     * @throws IOException if training or persisting the model fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            TokenizerME.usage();
        }
        int ai = 0;
        File outFile = new File(args[ai++]);
        int cutoff = 5;
        int iterations = 100;
        // Each optional argument is checked on its own: supplying only a cutoff
        // must not trigger an out-of-bounds read for the iteration count.
        if (args.length > ai) {
            cutoff = Integer.parseInt(args[ai++]);
        }
        if (args.length > ai) {
            iterations = Integer.parseInt(args[ai++]);
        }
        TokenizerME.train(new TokenStream(System.in), outFile, iterations, cutoff, "UTF8");
    }
}

