package cn.edu.bjut.chapter2;

import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class SentenceSplitter {
    // Regular-expression rule: split on whitespace that follows '.', '!' or '?'.
    // Note that this naive rule also breaks after abbreviations such as "Mr." and "e.g.".
    private static final String SPLITTER_REGEX = "(?<=[.!?])\\s+";

    public static String[] splitByRule(String text) {
        Pattern pattern = Pattern.compile(SPLITTER_REGEX);
        return pattern.split(text); // split the text into sentences
    }

    public static List<CoreMap> splitByStanfordCoreNLP(String text) {
        // Set up the Stanford CoreNLP pipeline
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit");
        props.setProperty("outputFormat", "text");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Create an Annotation object
        Annotation annotation = new Annotation(text);

        // Annotate the document
        pipeline.annotate(annotation);

        // Get the sentences
        return annotation.get(CoreAnnotations.SentencesAnnotation.class);
    }

    public static void main(String[] args) {
        String text = "Mr. Smith is here. This is a test! Isn't text mining "
                + "amazing? Let's see e.g. this case.";

        String[] sentencesByRule = splitByRule(text);
        System.out.println("Sentences split by rules: ");
        for (String sentence : sentencesByRule) {
            System.out.println(sentence);
        }

        List<CoreMap> sentencesByStanfordCoreNLP = splitByStanfordCoreNLP(text);
        System.out.println("Sentences split by Stanford CoreNLP: ");
        for (CoreMap sentence : sentencesByStanfordCoreNLP) {
            System.out.println(sentence.toString());
        }
    }
}
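
// --- Usage sketch (not part of the original listing) ---
// This hypothetical companion class illustrates how a caller might turn the
// List<CoreMap> returned by splitByStanfordCoreNLP() into plain sentence strings
// via CoreAnnotations.TextAnnotation, instead of relying on CoreMap.toString().
// It assumes the Stanford CoreNLP jars are on the classpath, exactly as
// SentenceSplitter itself does; the class name SentenceSplitterUsage is invented here.
class SentenceSplitterUsage {
    public static void main(String[] args) {
        List<CoreMap> sentences =
                SentenceSplitter.splitByStanfordCoreNLP("First sentence. Second one!");

        // Each sentence-level CoreMap stores its covering text under TextAnnotation.
        for (CoreMap sentence : sentences) {
            String plainText = sentence.get(CoreAnnotations.TextAnnotation.class);
            System.out.println(plainText);
        }
    }
}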