package cn.edu.bjut.chapter2; import java.util.ArrayList; import java.util.List; import java.util.Properties; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; import opennlp.tools.tokenize.SimpleTokenizer; import opennlp.tools.tokenize.WhitespaceTokenizer; public class Tokenizer { public static String[] tokenizeByWhitespace(String text) { WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE; return tokenizer.tokenize(text); } public static String[] tokenizeBySimple(String text) { SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; return tokenizer.tokenize(text); } public static List tokenizeByStanfordCoreNLP(String text) { // Set up the pipeline properties Properties props = new Properties(); props.setProperty("annotators", "tokenize"); props.setProperty("outputFormat", "text"); // Build the pipeline StanfordCoreNLP pipeline = new StanfordCoreNLP(props); // Create an annotation object Annotation annotation = new Annotation(text); // Annotate the text pipeline.annotate(annotation); // Retrieve the tokens List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); List tokens = new ArrayList(); for (CoreMap sentence : sentences) { tokens.addAll(sentence.get( CoreAnnotations.TokensAnnotation.class)); } return tokens; } public static void main(String[] args) { String text = "Natural Language Processing (NLP) is essential " + "for text analysis and text mining."; String[] tokensByWhitespace = tokenizeByWhitespace(text); System.out.println("Tokens tokenized by WhitespaceTokenizer: "); for (String token : tokensByWhitespace) { System.out.println(token); } String[] tokensBySimple = tokenizeBySimple(text); System.out.println("Tokens tokenized by SimpleTokenizer: "); for (String token : tokensBySimple) { System.out.println(token); } List tokensByStanfordCoreNLP = tokenizeByStanfordCoreNLP(text); System.out.println("Tokens tokenized by StanfordCoreNLP: "); for (CoreLabel token : tokensByStanfordCoreNLP) { System.out.println(token.word()); } } }