/*
 * Decompiled with CFR 0.152.
 */
package se.lth.cs.srl.preprocessor.tokenization;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.File;
import java.util.List;
import java.util.Properties;
import se.lth.cs.srl.preprocessor.tokenization.Tokenizer;
import se.lth.cs.srl.util.FileExistenceVerifier;

/**
 * {@link Tokenizer} implementation that delegates word segmentation to a
 * Stanford {@link CRFClassifier} loaded from a segmenter data directory.
 *
 * <p>The data directory must contain the serialized dictionary
 * {@code dict-chris6.ser.gz} and the model file {@code ctb.gz}.
 */
public class StanfordChineseSegmenterWrapper
implements Tokenizer {
    /** Artificial token placed at index 0 of every tokenized sentence. */
    private static final String ROOT_TOKEN = "<root>";

    private final CRFClassifier<CoreLabel> classifier;

    /**
     * Loads the segmenter model and dictionary from {@code dataDir}.
     *
     * @param dataDir directory holding {@code dict-chris6.ser.gz} and {@code ctb.gz}
     * @throws Error if {@link FileExistenceVerifier#verifyFiles} reports a problem
     *         with either file; the reported message becomes the error message
     */
    public StanfordChineseSegmenterWrapper(File dataDir) {
        File serDictionaryFile = new File(dataDir, "dict-chris6.ser.gz");
        File ctbFile = new File(dataDir, "ctb.gz");
        String error = FileExistenceVerifier.verifyFiles(serDictionaryFile, ctbFile);
        if (error != null) {
            // Error (rather than an exception type) is kept for compatibility with existing callers.
            throw new Error(error);
        }
        this.classifier = loadClassifier(dataDir.toString(), serDictionaryFile.toString(), ctbFile.toString());
    }

    /**
     * Segments {@code sentence} into words and prepends the {@code <root>} token.
     *
     * @param sentence the unsegmented input text
     * @return an array whose first element is {@code "<root>"}, followed by the
     *         segments produced by the classifier in order
     */
    @Override
    public String[] tokenize(String sentence) {
        List<String> segments = this.classifier.segmentString(sentence);
        // The no-arg toArray() returns Object[], which cannot be cast to String[]
        // (ClassCastException at runtime); the typed overload must be used instead.
        String[] tokens = segments.toArray(new String[0]);
        String[] withRoot = new String[tokens.length + 1];
        withRoot[0] = ROOT_TOKEN;
        System.arraycopy(tokens, 0, withRoot, 1, tokens.length);
        return withRoot;
    }

    /**
     * Builds a classifier configured for segmentation and loads the model into it.
     *
     * @param corporaDict value for the {@code sighanCorporaDict} property
     * @param serDictionary value for the {@code serDictionary} property
     * @param modelPath path of the serialized model passed to the classifier loader
     * @return the configured classifier
     */
    private static CRFClassifier<CoreLabel> loadClassifier(String corporaDict, String serDictionary, String modelPath) {
        Properties props = new Properties();
        props.setProperty("sighanCorporaDict", corporaDict);
        props.setProperty("serDictionary", serDictionary);
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");
        CRFClassifier<CoreLabel> loaded = new CRFClassifier<CoreLabel>(props);
        loaded.loadClassifierNoExceptions(modelPath, props);
        // Re-apply the properties after loading, as the original code did.
        // NOTE(review): presumably loading the model replaces the flags - confirm against the library.
        loaded.flags.setProperties(props);
        return loaded;
    }

    /**
     * Manual smoke test: loads the segmenter from a hard-coded developer path and
     * prints each segment of a sample sentence on its own line. Command-line
     * arguments are ignored.
     */
    public static void main(String[] args) throws Exception {
        String dir = "/home/users0/anders/storage/scratch/anders/stanford-segmenter-2013-06-20/";
        CRFClassifier<CoreLabel> classifier = loadClassifier(dir + "/data", dir + "/data/dict-chris6.ser.gz", dir + "/data/ctb.gz");
        List<String> forms = classifier.segmentString("\u4e0a\u6d77\u6d66\u4e1c\u8fd1\u5e74\u6765\u9881\u5e03\u5b9e\u884c\u4e86\u6d89\u53ca\u7ecf\u6d4e\u3001\u8d38\u6613\u3001\u5efa\u8bbe\u3001\u89c4\u5212\u3001\u79d1\u6280\u3001\u6587\u6559\u7b49\u9886\u57df\u7684\u4e03\u5341\u4e00\u4ef6\u6cd5\u89c4\u6027\u6587\u4ef6\uff0c\u786e\u4fdd\u4e86\u6d66\u4e1c\u5f00\u53d1\u7684\u6709\u5e8f\u8fdb\u884c\u3002");
        for (String form : forms) {
            System.out.println(form);
        }
    }
}

