/*
 * Decompiled with CFR 0.152.
 */
package se.lth.cs.srl.preprocessor;

import is2.data.SentenceData09;
import is2.lemmatizer.Lemmatizer;
import is2.parser.Parser;
import is2.tag.Tagger;
import is2.tools.Tool;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import se.lth.cs.srl.preprocessor.SimpleChineseLemmatizer;
import se.lth.cs.srl.preprocessor.tokenization.StanfordChineseSegmenterWrapper;
import se.lth.cs.srl.preprocessor.tokenization.Tokenizer;
import se.lth.cs.srl.util.BohnetHelper;
import se.lth.cs.srl.util.Util;

/**
 * Runs the configured preprocessing tools over a sentence, in a fixed order:
 * lemmatizer, tagger, mtagger ({@code is2.mtag.Tagger}) and parser. Every
 * tool is optional; a {@code null} tool is skipped.
 *
 * <p>The wall-clock time spent in each stage is accumulated, in milliseconds,
 * in the public {@code *Time} counters and can be rendered with
 * {@link #getStatus()}.
 *
 * <p>Thread-safety: calls into the tokenizer and the parser are serialized by
 * synchronizing on the respective tool object. The lemmatizer, tagger and
 * mtagger calls, and the updates of their timing counters, are not
 * synchronized.
 */
public class Preprocessor {
    protected final Tokenizer tokenizer;
    protected final Tool lemmatizer;
    protected final Tagger tagger;
    protected final is2.mtag.Tagger mtagger;
    protected final Parser parser;

    /** Accumulated time spent in {@link #tokenize(String)}, in milliseconds. */
    public long tokenizeTime = 0L;
    /** Accumulated time spent in the lemmatizer, in milliseconds. */
    public long lemmatizeTime = 0L;
    /** Accumulated time spent in the tagger, in milliseconds. */
    public long tagTime = 0L;
    /** Accumulated time spent in the mtagger (including feature splitting), in milliseconds. */
    public long mtagTime = 0L;
    /** Accumulated time spent in the parser, in milliseconds. */
    public long dpTime = 0L;

    /**
     * Creates a preprocessor from the given tools.
     *
     * @param tokenizer  used by {@link #tokenize(String)}; may be {@code null}
     *                   if that method is never called
     * @param lemmatizer lemmatization stage, or {@code null} to skip it
     * @param tagger     tagging stage, or {@code null} to skip it
     * @param mtagger    mtagger stage, or {@code null} to skip it
     * @param parser     parsing stage, or {@code null} to skip it
     */
    public Preprocessor(Tokenizer tokenizer, Lemmatizer lemmatizer, Tagger tagger, is2.mtag.Tagger mtagger, Parser parser) {
        this.tokenizer = tokenizer;
        this.lemmatizer = lemmatizer;
        this.tagger = tagger;
        this.mtagger = mtagger;
        this.parser = parser;
    }

    /**
     * Wraps the given forms in a fresh {@code SentenceData09} and preprocesses it.
     *
     * @param forms the token forms of the sentence
     * @return the result of {@link #preprocess(SentenceData09)}
     */
    public SentenceData09 preprocess(String[] forms) {
        SentenceData09 instance = new SentenceData09();
        instance.init(forms);
        return this.preprocess(instance);
    }

    /**
     * Wraps the given forms in a fresh {@code SentenceData09}, stores the
     * supplied lemmas and POS tags in its {@code plemmas} and {@code ppos}
     * fields, and preprocesses it. Any configured lemmatizer or tagger is
     * still applied to the instance afterwards.
     *
     * @param forms  the token forms of the sentence
     * @param lemmas values assigned to {@code plemmas}
     * @param poses  values assigned to {@code ppos}
     * @return the result of {@link #preprocess(SentenceData09)}
     */
    public SentenceData09 preprocess(String[] forms, String[] lemmas, String[] poses) {
        SentenceData09 instance = new SentenceData09();
        instance.init(forms);
        instance.ppos = poses;
        instance.plemmas = lemmas;
        return this.preprocess(instance);
    }

    /**
     * Applies every configured tool to the instance and updates the timing
     * counters.
     *
     * @param instance the sentence to process; it is modified by the tools
     * @return the instance returned by the parser if one is configured,
     *         otherwise a new {@code SentenceData09} constructed from
     *         {@code instance}
     */
    public SentenceData09 preprocess(SentenceData09 instance) {
        long start;
        if (this.lemmatizer != null) {
            start = System.currentTimeMillis();
            this.lemmatizer.apply(instance);
            this.lemmatizeTime += System.currentTimeMillis() - start;
        }
        if (this.tagger != null) {
            start = System.currentTimeMillis();
            this.tagger.apply(instance);
            this.tagTime += System.currentTimeMillis() - start;
        }
        if (this.mtagger != null) {
            start = System.currentTimeMillis();
            this.mtagger.apply(instance);
            // Expand each "a|b|c" feature string into its components. Index 0 is
            // skipped; presumably it holds the artificial root token (TODO confirm).
            for (int i = 1; i < instance.pfeats.length; ++i) {
                String feature = instance.pfeats[i];
                if (feature == null || feature.equals("_")) {
                    continue;
                }
                instance.feats[i] = feature.split("\\|");
            }
            this.mtagTime += System.currentTimeMillis() - start;
        } else {
            // No mtagger: mark every token's predicted features as unset ("_").
            instance.pfeats = new String[instance.forms.length];
            Arrays.fill(instance.pfeats, "_");
        }
        if (this.parser != null) {
            // Parser calls are serialized on the parser object.
            synchronized (this.parser) {
                start = System.currentTimeMillis();
                instance = this.parser.apply(instance);
                this.dpTime += System.currentTimeMillis() - start;
            }
        } else {
            instance = new SentenceData09(instance);
        }
        return instance;
    }

    /**
     * Tokenizes a sentence with the configured tokenizer and adds the elapsed
     * time to {@link #tokenizeTime}. Calls are serialized on the tokenizer
     * object.
     *
     * @param sentence the raw sentence text
     * @return the tokens produced by the tokenizer
     * @throws NullPointerException if no tokenizer was configured
     */
    public String[] tokenize(String sentence) {
        if (this.tokenizer == null) {
            // Synchronizing on null would throw a message-less NullPointerException;
            // keep the exception type but say what is wrong.
            throw new NullPointerException("No tokenizer configured for this Preprocessor");
        }
        synchronized (this.tokenizer) {
            long start = System.currentTimeMillis();
            String[] words = this.tokenizer.tokenize(sentence);
            this.tokenizeTime += System.currentTimeMillis() - start;
            return words;
        }
    }

    /**
     * Builds a human-readable report: the tokenizer's simple class name (only
     * when a tokenizer is configured) followed by one line per timing counter.
     *
     * @return a new {@code StringBuilder} holding the report
     */
    public StringBuilder getStatus() {
        StringBuilder sb = new StringBuilder();
        if (this.tokenizer != null) {
            sb.append("Tokenizer: " + this.tokenizer.getClass().getSimpleName()).append('\n');
        }
        sb.append("Tokenizer time:  " + Util.insertCommas(this.tokenizeTime)).append('\n');
        sb.append("Lemmatizer time: " + Util.insertCommas(this.lemmatizeTime)).append('\n');
        sb.append("Tagger time:     " + Util.insertCommas(this.tagTime)).append('\n');
        sb.append("MTagger time:    " + Util.insertCommas(this.mtagTime)).append('\n');
        sb.append("Parser time:     " + Util.insertCommas(this.dpTime)).append('\n');
        return sb;
    }

    /**
     * Small driver: reads a UTF-8 file line by line, tokenizes each line with
     * the Stanford Chinese segmenter wrapper, runs the lemmatizer and tagger,
     * and prints each resulting sentence to standard output.
     *
     * <p>All arguments are optional and positional; when omitted, the original
     * hard-coded locations are used:
     * <ol>
     *   <li>input file (default {@code chi-desegmented.out})</li>
     *   <li>segmenter data directory</li>
     *   <li>tagger model file (default {@code models/chi/tag-chn.model})</li>
     * </ol>
     *
     * @param args optional overrides as described above
     * @throws Exception if a tool cannot be loaded or the input cannot be read
     */
    public static void main(String[] args) throws Exception {
        File desegmentedInput = new File(argOrDefault(args, 0, "chi-desegmented.out"));
        File segmenterData = new File(argOrDefault(args, 1, "/home/anders/Download/stanford-chinese-segmenter-2008-05-21/data"));
        File taggerModel = new File(argOrDefault(args, 2, "models/chi/tag-chn.model"));

        StanfordChineseSegmenterWrapper tokenizer = new StanfordChineseSegmenterWrapper(segmenterData);
        SimpleChineseLemmatizer lemmatizer = new SimpleChineseLemmatizer();
        Tagger tagger = BohnetHelper.getTagger(taggerModel);
        Preprocessor pp = new Preprocessor(tokenizer, lemmatizer, tagger, null, null);

        InputStream in = new FileInputStream(desegmentedInput);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = pp.tokenize(line);
                SentenceData09 s = pp.preprocess(tokens);
                System.out.println(s);
            }
        } finally {
            // Release the file even when a stage or the read itself fails.
            in.close();
        }
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return args != null && index < args.length ? args[index] : fallback;
    }

    /**
     * @return {@code true} if a parser was configured
     */
    public boolean hasParser() {
        return this.parser != null;
    }
}

