package cn.edu.bjut.chapter2;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

/**
 * Tokenizes text with an OpenNLP {@link TokenizerME} built from the model
 * file named by {@code MODEL_FILE}.
 *
 * <p>The tokenizer is created on the first call to {@link #tokenize(String)}
 * and kept in a static field for later calls. The lazy initialization is not
 * synchronized.
 */
public class TokenizerByOpenNLP {
	/** Cached tokenizer; stays {@code null} until the model has been loaded. */
	private static TokenizerME tokenizer = null;

	/** Path of the tokenizer model file, resolved against the working directory. */
	private static final String MODEL_FILE =
			"resource/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin";

	/**
	 * Reads the tokenizer model from {@code fname} and stores a tokenizer
	 * built from it in the static {@code tokenizer} field.
	 *
	 * <p>The field is only assigned after the model has been read and the
	 * stream closed without error, so a failed load never leaves a
	 * half-initialized tokenizer behind.
	 *
	 * @param fname path of the model file to read
	 * @throws IllegalStateException if the file does not exist or cannot be
	 *         read as a tokenizer model; the underlying I/O exception is kept
	 *         as the cause
	 */
	private static void loadModel(String fname) {
		TokenizerModel model;
		// try-with-resources closes the stream on both the success and the
		// failure path.
		try (InputStream stream = new FileInputStream(fname)) {
			model = new TokenizerModel(stream);
		} catch (FileNotFoundException e) {
			throw new IllegalStateException(
					"Tokenizer model file not found: " + fname, e);
		} catch (IOException e) {
			throw new IllegalStateException(
					"Failed to load tokenizer model: " + fname, e);
		}

		tokenizer = new TokenizerME(model);
	}

	/**
	 * Splits {@code str} into tokens, loading the model from
	 * {@code MODEL_FILE} first if no tokenizer has been created yet.
	 *
	 * @param str text to tokenize
	 * @return the tokens produced by the OpenNLP tokenizer
	 * @throws IllegalStateException if the model has to be loaded and loading
	 *         fails
	 */
	public static String[] tokenize(String str) {
		if (tokenizer == null) {
			loadModel(MODEL_FILE);
		}

		return tokenizer.tokenize(str);
	}

	/**
	 * Demo entry point: tokenizes a fixed sample sentence and prints one
	 * token per line to standard output.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {
		String str = "Natural Language Processing (NLP) is essential "
				+ "for text analysis and text mining.";
		String[] tokens = tokenize(str);
		System.out.println("Tokens tokenized by OpenNLP: ");
		for (String token : tokens) {
			System.out.println(token);
		}
	}
}