/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.util;

import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.FeatureCountPipe;
import cc.mallet.pipe.FeatureDocFreqPipe;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.SimpleTokenizer;
import cc.mallet.pipe.StringList2FeatureSequence;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CommandOption;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * Command-line tool for importing large amounts of text into Mallet format.
 *
 * <p>The input file is read one instance per line and split into name, label and
 * data fields with a configurable regular expression. When pruning is requested
 * ({@code --prune-count} or {@code --prune-doc-frequency}) the input is read
 * twice: a first pass collects feature statistics and extends the tokenizer's
 * stoplist, and a second pass builds and saves the instance list.
 */
public class BulkLoader {
    // Command-line options. Each option's help string describes its meaning.
    static CommandOption.File inputFile = new CommandOption.File(BulkLoader.class, "input", "FILE", true, null, "The file containing data, one instance per line", null);
    static CommandOption.File outputFile = new CommandOption.File(BulkLoader.class, "output", "FILE", true, new File("mallet.data"), "Write the instance list to this file", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(BulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(BulkLoader.class, "remove-stopwords", "[TRUE|FALSE]", false, false, "If true, remove common \"stop words\" from the text.\nThis option invokes a minimal English stoplist. ", null);
    static CommandOption.File stoplistFile = new CommandOption.File(BulkLoader.class, "stoplist", "FILE", true, null, "Read newline-separated words from this file,\n   and remove them from text. This option overrides\n   the default English stoplist triggered by --remove-stopwords.", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(BulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.String lineRegex = new CommandOption.String(BulkLoader.class, "line-regex", "REGEX", true, "^([^\\t]*)\\t([^\\t]*)\\t(.*)", "Regular expression containing regex-groups for label, name and data.", null);
    static CommandOption.Integer nameGroup = new CommandOption.Integer(BulkLoader.class, "name", "INTEGER", true, 1, "The index of the group containing the instance name.\n   Use 0 to indicate that this field is not used.", null);
    static CommandOption.Integer labelGroup = new CommandOption.Integer(BulkLoader.class, "label", "INTEGER", true, 2, "The index of the group containing the label string.\n   Use 0 to indicate that this field is not used.", null);
    static CommandOption.Integer dataGroup = new CommandOption.Integer(BulkLoader.class, "data", "INTEGER", true, 3, "The index of the group containing the data.", null);
    static CommandOption.Integer pruneCount = new CommandOption.Integer(BulkLoader.class, "prune-count", "N", false, 0, "Reduce features to those that occur more than N times.", null);
    static CommandOption.Double docProportionCutoff = new CommandOption.Double(BulkLoader.class, "prune-doc-frequency", "N", false, 1.0, "Remove features that occur in more than (X*100)% of documents. 0.05 is equivalent to IDF of 3.0.", null);

    /**
     * Runs a counting pass over the input file and hands the collected
     * statistics to {@code prunedTokenizer} via the counters'
     * {@code addPrunedWordsToStoplist} methods.
     *
     * <p>The pass itself tokenizes with a deep clone of {@code prunedTokenizer};
     * the original tokenizer is only touched after the whole file has been read.
     * A count-based counter is used only when {@code --prune-count} is positive,
     * and a document-frequency counter only when {@code --prune-doc-frequency}
     * is below 1.0.
     *
     * @param prunedTokenizer tokenizer that receives the pruned words
     *        (presumably appended to its stoplist -- confirm against
     *        FeatureCountPipe / FeatureDocFreqPipe)
     * @throws IOException if the input file cannot be opened or closed
     */
    public static void generateStoplist(SimpleTokenizer prunedTokenizer) throws IOException {
        boolean pruneByCount = BulkLoader.pruneCount.value > 0;
        boolean pruneByDocFrequency = BulkLoader.docProportionCutoff.value < 1.0;

        // NOTE(review): FileReader decodes with the platform default charset.
        Reader input = new FileReader(BulkLoader.inputFile.value);
        try {
            CsvIterator reader = new CsvIterator(input, BulkLoader.lineRegex.value, BulkLoader.dataGroup.value, BulkLoader.labelGroup.value, BulkLoader.nameGroup.value);

            // Both counters share this alphabet with the feature-sequence pipe.
            Alphabet alphabet = new Alphabet();
            FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);
            FeatureDocFreqPipe docCounter = new FeatureDocFreqPipe(alphabet, null);

            ArrayList<Pipe> pipes = new ArrayList<Pipe>();
            if (!BulkLoader.preserveCase.value) {
                pipes.add(new CharSequenceLowercase());
            }
            pipes.add(prunedTokenizer.deepClone());
            pipes.add(new StringList2FeatureSequence(alphabet));
            if (pruneByCount) {
                pipes.add(featureCounter);
            }
            if (pruneByDocFrequency) {
                pipes.add(docCounter);
            }

            Pipe serialPipe = new SerialPipes(pipes);
            Iterator<Instance> instances = serialPipe.newIteratorFrom(reader);

            // Drain the iterator purely for the counters' side effects,
            // reporting progress every 100000 instances.
            int seen = 0;
            while (instances.hasNext()) {
                if (++seen % 100000 == 0) {
                    System.out.println(seen);
                }
                instances.next();
            }

            if (pruneByCount) {
                featureCounter.addPrunedWordsToStoplist(prunedTokenizer, BulkLoader.pruneCount.value);
            }
            if (pruneByDocFrequency) {
                docCounter.addPrunedWordsToStoplist(prunedTokenizer, BulkLoader.docProportionCutoff.value);
            }
        } finally {
            // Release the file handle even if piping an instance fails.
            input.close();
        }
    }

    /**
     * Reads the input file through the import pipeline (optional lowercasing,
     * {@code prunedTokenizer}, feature-sequence conversion) and saves the
     * resulting instance list to the {@code --output} file.
     *
     * @param prunedTokenizer tokenizer used directly (not cloned) in the pipeline
     * @throws IOException if the input file cannot be opened or closed
     */
    public static void writeInstanceList(SimpleTokenizer prunedTokenizer) throws IOException {
        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (!BulkLoader.preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(prunedTokenizer);
        pipes.add(new StringList2FeatureSequence(new Alphabet()));
        InstanceList instances = new InstanceList(new SerialPipes(pipes));

        // NOTE(review): FileReader decodes with the platform default charset.
        Reader input = new FileReader(BulkLoader.inputFile.value);
        try {
            CsvIterator reader = new CsvIterator(input, BulkLoader.lineRegex.value, BulkLoader.dataGroup.value, BulkLoader.labelGroup.value, BulkLoader.nameGroup.value);
            instances.addThruPipe(reader);
        } finally {
            // Release the file handle even if piping an instance fails.
            input.close();
        }
        instances.save(BulkLoader.outputFile.value);
    }

    /**
     * Command-line entry point: parses the options, chooses a tokenizer,
     * optionally runs the pruning pass, then writes the instance list.
     *
     * @param args command-line arguments handed to {@code CommandOption.process}
     * @throws IOException if reading the input file fails
     */
    public static void main(String[] args) throws IOException {
        CommandOption.setSummary(BulkLoader.class, "Efficient tool for importing large amounts of text into Mallet format");
        CommandOption.process(BulkLoader.class, args);

        // An explicit stoplist file takes precedence over --remove-stopwords.
        SimpleTokenizer tokenizer;
        if (BulkLoader.stoplistFile.value != null) {
            tokenizer = new SimpleTokenizer(BulkLoader.stoplistFile.value);
        } else if (BulkLoader.removeStopWords.value) {
            // presumably 1 selects the built-in English stoplist -- confirm in SimpleTokenizer
            tokenizer = new SimpleTokenizer(1);
        } else {
            // presumably 0 selects an empty stoplist -- confirm in SimpleTokenizer
            tokenizer = new SimpleTokenizer(0);
        }

        // The extra counting pass is only needed when some pruning is requested.
        if (BulkLoader.pruneCount.value > 0 || BulkLoader.docProportionCutoff.value < 1.0) {
            BulkLoader.generateStoplist(tokenizer);
        }
        BulkLoader.writeInstanceList(tokenizer);
    }
}

