/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.pipe;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;

/**
 * A pipe that splits a {@link CharSequence} into a list of letter-only tokens,
 * discarding any token found in a stoplist.
 *
 * <p>Tokenization is driven by the Unicode general category of each code point
 * (see {@link Character#getType(int)}):
 * <ul>
 *   <li>letters and combining marks are appended to the current token;</li>
 *   <li>separators and punctuation end the current token;</li>
 *   <li>every other code point (digits, symbols, control characters such as
 *       newline and tab, ...) is dropped <em>without</em> ending the current
 *       token.</li>
 * </ul>
 * No case folding is performed, so stoplist matching is case-sensitive.
 */
public class SimpleTokenizer extends Pipe {
    /** Constructor flag: start with an empty stoplist. */
    public static final int USE_EMPTY_STOPLIST = 0;
    /** Constructor flag: start with the built-in English stoplist. */
    public static final int USE_DEFAULT_ENGLISH_STOPLIST = 1;

    /** Words installed by {@link #USE_DEFAULT_ENGLISH_STOPLIST}, in insertion order. */
    private static final String[] DEFAULT_ENGLISH_STOPWORDS = {
        "the", "a", "an", "and", "or", "of", "for", "in", "on", "to", "with",
        "by", "this", "that", "these", "those", "some", "other", "it", "its",
        "we", "our", "as", "but", "not", "do", "does", "is", "be", "are",
        "can", "was", "were"
    };

    /** Tokens that are removed from the output of {@link #pipe(Instance)}. */
    protected HashSet<String> stoplist;
    static final long serialVersionUID = 1L;

    /**
     * Creates a tokenizer with either an empty or the default English stoplist.
     *
     * @param languageFlag {@link #USE_DEFAULT_ENGLISH_STOPLIST} to preload the
     *        built-in English stopwords; any other value (including
     *        {@link #USE_EMPTY_STOPLIST}) leaves the stoplist empty
     */
    public SimpleTokenizer(int languageFlag) {
        this.stoplist = new HashSet<String>();
        if (languageFlag == USE_DEFAULT_ENGLISH_STOPLIST) {
            for (String word : DEFAULT_ENGLISH_STOPWORDS) {
                this.stop(word);
            }
        }
    }

    /**
     * Creates a tokenizer whose stoplist is read from a UTF-8 text file, one
     * stopword per line. Lines are added verbatim (no trimming).
     *
     * <p>Loading is best-effort: any failure is reported on {@code System.err}
     * and the tokenizer keeps whatever words were read before the failure.
     *
     * @param stopfile the file containing the stopwords
     */
    public SimpleTokenizer(File stopfile) {
        this.stoplist = new HashSet<String>();
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(stopfile), "UTF-8"));
            try {
                String word;
                while ((word = in.readLine()) != null) {
                    this.stop(word);
                }
            }
            finally {
                // Release the file handle even when reading fails part-way through.
                in.close();
            }
        }
        catch (Exception e) {
            System.err.println("problem loading stoplist: " + e);
        }
    }

    /**
     * Creates a tokenizer that uses the given set as its stoplist. The set is
     * stored by reference, not copied, so later changes to it are visible to
     * this tokenizer.
     *
     * @param stoplist the stopword set to use
     */
    public SimpleTokenizer(HashSet<String> stoplist) {
        this.stoplist = stoplist;
    }

    /**
     * Returns a new tokenizer with its own copy of the current stoplist.
     *
     * @return an independent tokenizer holding the same stopwords
     */
    public SimpleTokenizer deepClone() {
        return new SimpleTokenizer(new HashSet<String>(this.stoplist));
    }

    /**
     * Adds a word to the stoplist.
     *
     * @param word the token to exclude from future output
     */
    public void stop(String word) {
        this.stoplist.add(word);
    }

    /**
     * Replaces the instance's {@link CharSequence} data with an
     * {@code ArrayList<String>} of its tokens, minus stopwords.
     *
     * @param instance the instance whose data is tokenized in place
     * @return the same instance, now carrying the token list as its data
     * @throws IllegalArgumentException if the instance's data is {@code null}
     *         or not a {@link CharSequence}
     */
    @Override
    public Instance pipe(Instance instance) {
        Object data = instance.getData();
        if (!(data instanceof CharSequence)) {
            throw new IllegalArgumentException("Looking for a CharSequence, found a " + (data == null ? "null" : data.getClass()));
        }
        instance.setData(this.tokenize((CharSequence)data));
        return instance;
    }

    /** Splits the text into tokens by Unicode category and filters out stopwords. */
    private ArrayList<String> tokenize(CharSequence text) {
        ArrayList<String> tokens = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        int charLength = text.length();
        int offset = 0;
        while (offset < charLength) {
            // codePointAt takes a char index, so advance by the number of chars
            // the code point occupies (2 for supplementary characters).
            int codePoint = Character.codePointAt(text, offset);
            offset += Character.charCount(codePoint);
            int type = Character.getType(codePoint);
            if (SimpleTokenizer.isTokenCharacter(type)) {
                current.appendCodePoint(codePoint);
            } else if (SimpleTokenizer.isTokenBoundary(type)) {
                this.flushToken(current, tokens);
            }
            // Anything else (digits, symbols, control characters, ...) is
            // dropped without terminating the token in progress.
        }
        this.flushToken(current, tokens);
        return tokens;
    }

    /** Emits the pending token, unless it is empty or stoplisted, and resets the buffer. */
    private void flushToken(StringBuilder current, ArrayList<String> tokens) {
        if (current.length() == 0) {
            return;
        }
        String token = current.toString();
        current.setLength(0);
        if (!this.stoplist.contains(token)) {
            tokens.add(token);
        }
    }

    /** Returns true for the letter and combining-mark categories that make up a token. */
    private static boolean isTokenCharacter(int type) {
        switch (type) {
            case Character.UPPERCASE_LETTER:
            case Character.LOWERCASE_LETTER:
            case Character.TITLECASE_LETTER:
            case Character.MODIFIER_LETTER:
            case Character.OTHER_LETTER:
            case Character.NON_SPACING_MARK:
            case Character.ENCLOSING_MARK:
            case Character.COMBINING_SPACING_MARK:
                return true;
            default:
                return false;
        }
    }

    /** Returns true for the separator and punctuation categories that end a token. */
    private static boolean isTokenBoundary(int type) {
        switch (type) {
            case Character.SPACE_SEPARATOR:
            case Character.LINE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
            case Character.DASH_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }
}

