Download: Text Representation
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.edu.bjut</groupId>
    <artifactId>text-mining</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>text-mining</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.opennlp</groupId>
            <artifactId>opennlp-tools</artifactId>
            <version>1.9.3</version>
        </dependency>
        <dependency>
            <groupId>edu.stanford.nlp</groupId>
            <artifactId>stanford-corenlp</artifactId>
            <version>4.5.7</version>
        </dependency>
        <dependency>
            <groupId>edu.stanford.nlp</groupId>
            <artifactId>stanford-corenlp</artifactId>
            <version>4.5.7</version>
            <classifier>models</classifier>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-text</artifactId>
            <version>1.9</version>
        </dependency>
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-core</artifactId>
            <version>1.0.0-M1.1</version>
        </dependency>
        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native-platform</artifactId>
            <version>1.0.0-M1.1</version>
        </dependency>
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-nlp</artifactId>
            <version>1.0.0-M1.1</version>
        </dependency>
    </dependencies>
</project>
package cn.edu.bjut.chapter2;

import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class SentenceSplitter {
    private static final String SPLITTER_REGEX = "(?<=[.!?])\\s+"; // Regular-expression rule

    public static String[] splitByRule(String text) {
        Pattern pattern = Pattern.compile(SPLITTER_REGEX);
        return pattern.split(text); // Split into sentences
    }

    public static List<CoreMap> splitByStanfordCoreNLP(String text) {
        // Set up the Stanford NLP pipeline
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit");
        props.setProperty("outputFormat", "text");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Create an Annotation object
        Annotation annotation = new Annotation(text);

        // Annotate the document
        pipeline.annotate(annotation);

        // Get sentences
        return annotation.get(CoreAnnotations.SentencesAnnotation.class);
    }

    public static void main(String[] args) {
        String text = "Mr. Smith is here. This is a test! Isn't text mining "
                + "amazing? Let's see e.g. this case.";

        String[] sentencesByRule = splitByRule(text);
        System.out.println("Sentences split by rules: ");
        for (String sentence : sentencesByRule) {
            System.out.println(sentence);
        }

        List<CoreMap> sentencesByStanfordCoreNLP = splitByStanfordCoreNLP(text);
        System.out.println("Sentences split by Stanford CoreNLP: ");
        for (CoreMap sentence : sentencesByStanfordCoreNLP) {
            System.out.println(sentence.toString());
        }
    }
}
If the following error message appears at runtime:
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
then, following the advice at http://www.slf4j.org/codes.html#StaticLoggerBinder, add any one of slf4j-nop.jar, slf4j-simple.jar, slf4j-log4j12.jar, slf4j-jdk14.jar, or logback-classic.jar to the classpath.
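For instance, one minimal fix in this Maven project is to declare a concrete SLF4J binding next to the slf4j-api dependency already listed in the pom.xml above; the snippet below uses slf4j-simple at the same 1.7.x version (any of the bindings named above would work equally well):

<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-simple</artifactId>
    <version>1.7.12</version>
</dependency>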
package cn.edu.bjut.chapter2;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

public class SentenceSpliterByOpenNLP {
    private static SentenceDetectorME detector = null;
    private static final String MODEL_FILE = "resource/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin";

    private static void loadModel(String fname) {
        SentenceModel model = null;
        try {
            InputStream stream = new FileInputStream(fname);
            try {
                model = new SentenceModel(stream);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (stream != null) {
                    try {
                        stream.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            detector = new SentenceDetectorME(model);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    public static String[] detect(String str) {
        if (detector == null) {
            loadModel(MODEL_FILE);
        }
        return detector.sentDetect(str);
    }

    public static void main(String[] args) {
        String str = "Mr. Smith is here. This is a test! Isn't text mining "
                + "amazing? Let's see e.g. this case.";

        String[] sentences = detect(str);
        System.out.println("Sentences split by OpenNLP: ");
        for (String sentence : sentences) {
            System.out.println(sentence);
        }
    }
}
package cn.edu.bjut.chapter2;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;

public class Tokenizer {
    public static String[] tokenizeByWhitespace(String text) {
        WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
        return tokenizer.tokenize(text);
    }

    public static String[] tokenizeBySimple(String text) {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        return tokenizer.tokenize(text);
    }

    public static List<CoreLabel> tokenizeByStanfordCoreNLP(String text) {
        // Set up the pipeline properties
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize");
        props.setProperty("outputFormat", "text");

        // Build the pipeline
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Create an annotation object
        Annotation annotation = new Annotation(text);

        // Annotate the text
        pipeline.annotate(annotation);

        // Retrieve the tokens
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (CoreMap sentence : sentences) {
            tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
        }

        return tokens;
    }

    public static void main(String[] args) {
        String text = "Natural Language Processing (NLP) is essential "
                + "for text analysis and text mining.";

        String[] tokensByWhitespace = tokenizeByWhitespace(text);
        System.out.println("Tokens tokenized by WhitespaceTokenizer: ");
        for (String token : tokensByWhitespace) {
            System.out.println(token);
        }

        String[] tokensBySimple = tokenizeBySimple(text);
        System.out.println("Tokens tokenized by SimpleTokenizer: ");
        for (String token : tokensBySimple) {
            System.out.println(token);
        }

        List<CoreLabel> tokensByStanfordCoreNLP = tokenizeByStanfordCoreNLP(text);
        System.out.println("Tokens tokenized by StanfordCoreNLP: ");
        for (CoreLabel token : tokensByStanfordCoreNLP) {
            System.out.println(token.word());
        }
    }
}
package cn.edu.bjut.chapter2;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class TokenizerByOpenNLP {
    private static TokenizerME tokenizer = null;
    private static final String MODEL_FILE = "resource/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin";

    private static void loadModel(String fname) {
        TokenizerModel model = null;
        try {
            InputStream stream = new FileInputStream(fname);
            try {
                model = new TokenizerModel(stream);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (stream != null) {
                    try {
                        stream.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            tokenizer = new TokenizerME(model);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    public static String[] tokenize(String str) {
        if (tokenizer == null) {
            loadModel(MODEL_FILE);
        }
        return tokenizer.tokenize(str);
    }

    public static void main(String[] args) {
        String str = "Natural Language Processing (NLP) is essential "
                + "for text analysis and text mining.";

        String[] tokens = tokenize(str);
        System.out.println("Tokens tokenized by OpenNLP: ");
        for (int i = 0; i < tokens.length; i++) {
            System.out.println(tokens[i]);
        }
    }
}
package cn.edu.bjut.chapter2;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class PoSTaggerByStanfordCoreNLP {
    public static List<CoreLabel> tag(String text) {
        // Set up the pipeline properties
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos");
        props.setProperty("outputFormat", "text");

        // Build the pipeline
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Create an annotation object
        Annotation annotation = new Annotation(text);

        // Annotate the text
        pipeline.annotate(annotation);

        // Retrieve the tokens
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (CoreMap sentence : sentences) {
            tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
        }

        return tokens;
    }

    public static void main(String[] args) {
        String text = "Natural Language Processing (NLP) is essential for text "
                + "analysis and text mining.";

        List<CoreLabel> tokens = tag(text);
        for (CoreLabel token : tokens) {
            System.out.println(token.word() + ": " + token.tag());
        }
    }
}
package cn.edu.bjut.chapter2;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;

public class PoSTaggerByOpenNLP {
    private static POSTaggerME tagger = null;
    private static final String MODEL_FILE = "resource/opennlp-en-ud-ewt-pos-1.0-1.9.3.bin";

    private static void loadModel(String fname) {
        POSModel model = null;
        try {
            InputStream stream = new FileInputStream(fname);
            try {
                model = new POSModel(stream);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (stream != null) {
                    try {
                        stream.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            tagger = new POSTaggerME(model);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    public static String[] tag(String[] tokens) {
        if (tagger == null) {
            loadModel(MODEL_FILE);
        }
        return tagger.tag(tokens);
    }

    public static void main(String[] args) {
        String[] tokens = new String[] { "Most", "large", "cities", "in", "the", "US",
                "had", "morning", "and", "afternoon", "newspapers", "." };
        String[] tags = tag(tokens);
        for (int i = 0; i < tags.length; i++) {
            System.out.println(tokens[i] + "\t" + tags[i]);
        }
    }
}
package cn.edu.bjut.chapter2;

import opennlp.tools.stemmer.PorterStemmer;

public class Stemmer {
    public static String stem(String token) {
        PorterStemmer stemmer = new PorterStemmer();
        return stemmer.stem(token);
    }

    public static void main(String[] args) {
        String[] words = { "running", "jumps", "better", "happily" };
        for (String word : words) {
            String stem = stem(word);
            System.out.println(word + "-->" + stem);
        }
    }
}
package cn.edu.bjut.chapter2;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class Lemmatizer {
    public static List<CoreLabel> tag(String text) {
        // Set up the pipeline properties
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
        props.setProperty("outputFormat", "text");

        // Build the pipeline
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Create an annotation object
        Annotation annotation = new Annotation(text);

        // Annotate the text
        pipeline.annotate(annotation);

        // Retrieve the tokens
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (CoreMap sentence : sentences) {
            tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
        }

        return tokens;
    }

    public static void main(String[] args) {
        String text = "Natural Language Processing (NLP) is essential "
                + "for text analysis and text mining.";

        List<CoreLabel> tokens = tag(text);
        for (CoreLabel token : tokens) {
            System.out.println(token.word() + "-->" + token.lemma());
        }
    }
}
package cn.edu.bjut.chapter2;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;

public class StopwordChecker {
    private Set<String> stopwords;
    private String fname = "resource/stoplist.txt";
    private boolean caseSensitive;

    public StopwordChecker(boolean caseSensitive) {
        this.stopwords = new HashSet<String>();
        this.caseSensitive = caseSensitive;
        load(this.fname);
    }

    public StopwordChecker() {
        this(false);
    }

    private void load(final String fname) {
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(fname), "UTF-8"));
            for (String line = null; (line = reader.readLine()) != null;) {
                line = line.trim();
                stopwords.add(caseSensitive ? line : line.toLowerCase());
            }
            reader.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public boolean check(String word) {
        return stopwords.contains(caseSensitive ? word : word.toLowerCase());
    }

    public static void main(String[] args) {
        StopwordChecker checker = new StopwordChecker();
        String[] tokens = { "Text", "analysis", "and", "text", "mining", "are", "amazing", "!" };
        for (String token : tokens) {
            System.out.println(token + ": " + checker.check(token));
        }
    }
}
package cn.edu.bjut.chapter2;

import java.util.HashMap;
import java.util.Map;

public class Document {
    private String content;
    private Map<String, Double> tfMap;
    private StopwordChecker stopwordChecker;

    public Document(String content) {
        this.content = content;
        this.stopwordChecker = new StopwordChecker();
        calculateTermFrequency();
    }

    private void calculateTermFrequency() {
        this.tfMap = new HashMap<String, Double>();

        int total = 0;
        for (String sentence : SentenceSpliterByOpenNLP.detect(content)) {
            for (String token : TokenizerByOpenNLP.tokenize(sentence)) {
                if (!stopwordChecker.check(token)) {
                    total++;
                    token = token.toLowerCase();
                    tfMap.put(token, tfMap.getOrDefault(token, 0.0) + 1);
                }
            }
        }

        for (String key : tfMap.keySet()) {
            tfMap.put(key, tfMap.get(key) / total);
        }
    }

    public Map<String, Double> getTermFrequency() {
        return tfMap;
    }
}
package cn.edu.bjut.chapter2;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class TFIDFAnalyzer {
    private List<Document> documents;

    public TFIDFAnalyzer(List<Document> documents) {
        this.documents = documents;
    }

    public Map<String, Double> calculateIDF() {
        Map<String, Double> idf = new HashMap<String, Double>();

        int totalDocuments = documents.size();
        for (Document document : documents) {
            Set<String> tokens = document.getTermFrequency().keySet();
            for (String token : tokens) {
                idf.put(token, idf.getOrDefault(token, 0.0) + 1);
            }
        }

        for (String key : idf.keySet()) {
            idf.put(key, Math.log(totalDocuments / idf.get(key)));
        }

        return idf;
    }

    public Map<String, Double> calculateTFIDF() {
        Map<String, Double> tfidf = new HashMap<String, Double>();

        Map<String, Double> idf = calculateIDF();
        for (Document document : documents) {
            Map<String, Double> tf = document.getTermFrequency();
            for (String term : tf.keySet()) {
                double tfidfValue = tf.get(term) * idf.getOrDefault(term, 0.0);
                tfidf.put(term, tfidf.getOrDefault(term, 0.0) + tfidfValue);
            }
        }

        return tfidf;
    }

    public static void main(String[] args) {
        List<Document> documents = new ArrayList<Document>();
        documents.add(new Document("A rose is very beautiful."));
        documents.add(new Document("A rose is a flower, which is a rose."));

        TFIDFAnalyzer tfidfAnalyzer = new TFIDFAnalyzer(documents);
        Map<String, Double> tfidf = tfidfAnalyzer.calculateTFIDF();
        for (Map.Entry<String, Double> entry : tfidf.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }
    }
}
package cn.edu.bjut.chapter2;

import java.io.File;
import java.util.Arrays;
import java.util.Collection;

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.documentiterator.FileDocumentIterator;

public class Word2VecExample {
    public static void main(String[] args) throws Exception {
        // Step 1: Prepare the Dataset
        String trainFile = "data/train.txt";
        FileDocumentIterator iterator = new FileDocumentIterator(new File(trainFile));

        // Step 2: Build the Word2Vec Model
        Word2Vec word2Vec = new Word2Vec.Builder()
                .iterate(iterator).epochs(5)
                .layerSize(100).minWordFrequency(1)
                .seed(42).build();

        // Step 3: Fit the Model
        word2Vec.fit();

        // Get a word vector
        double[] vector = word2Vec.getWordVector("Java");
        System.out.println("Java's Vector: " + Arrays.toString(vector));

        // Find closest words to 'Java'
        System.out.println("Closest words to 'Java': ");
        Collection<String> nearestWords = word2Vec.wordsNearest("Java", 5);
        System.out.println(nearestWords);
    }
}
Comments
I wish the .bin model files were provided directly, instead of making us waste half a day solving a problem that isn't really worth the effort.
The official site no longer offers the 1.0 versions of the .bin files; everything there is of the "opennlp-en-ud-ewt-sentence-1.3-2.5.4.bin" kind, and changing the pom file to match causes version incompatibilities.
Folks, the .bin incompatibility problem is solved: just switch from Java 8 to Java 17!!!
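For readers who want to try that fix, a minimal sketch of the corresponding change in this project's pom.xml (assuming a JDK 17 installation is used to build and run, since the newer OpenNLP 2.x releases that ship those model files require a newer JDK; the OpenNLP dependency version would also need to be raised to match):

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>17</maven.compiler.source>
    <maven.compiler.target>17</maven.compiler.target>
</properties>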