package cn.edu.bjut.chapter2;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;

public class Tokenizer {
	public static String[] tokenizeByWhitespace(String text) {
		WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
		return tokenizer.tokenize(text); 
	}
	
	public static String[] tokenizeBySimple(String text) {
		SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
		return tokenizer.tokenize(text); 
	}
	
	public static List<CoreLabel> tokenizeByStanfordCoreNLP(String text) {
		// Set up the pipeline properties
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize");
		props.setProperty("outputFormat", "text");

		// Build the pipeline
		StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
		// Create an annotation object
		Annotation annotation = new Annotation(text);
		// Annotate the text
		pipeline.annotate(annotation);

		// Retrieve the tokens
		List<CoreMap> sentences = 
				annotation.get(CoreAnnotations.SentencesAnnotation.class);
		List<CoreLabel> tokens = new ArrayList<CoreLabel>(); 
		for (CoreMap sentence : sentences) {
			tokens.addAll(sentence.get(
					CoreAnnotations.TokensAnnotation.class)); 
		}
		
		return tokens;
	}
	
	public static void main(String[] args) {
		String text = "Natural Language Processing (NLP) is essential "
				+ "for text analysis and text mining."; 
		
		String[] tokensByWhitespace = tokenizeByWhitespace(text); 
		System.out.println("Tokens tokenized by WhitespaceTokenizer: ");
		for (String token : tokensByWhitespace) {
			System.out.println(token);
		}
		
		String[] tokensBySimple = tokenizeBySimple(text); 
		System.out.println("Tokens tokenized by SimpleTokenizer: ");
		for (String token : tokensBySimple) {
			System.out.println(token);
		}
		
		List<CoreLabel> tokensByStanfordCoreNLP = 
				tokenizeByStanfordCoreNLP(text); 
		System.out.println("Tokens tokenized by StanfordCoreNLP: ");
		for (CoreLabel token : tokensByStanfordCoreNLP) {
			System.out.println(token.word());
		}
	}
}