Chapter 2: Text Representation

Slides

Download: Text Representation

Related Resources

POM Configuration File

pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
 
	<groupId>cn.edu.bjut</groupId>
	<artifactId>text-mining</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>
 
	<name>text-mining</name>
	<url>http://maven.apache.org</url>
 
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>
 
	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
 
		<dependency>
			<groupId>org.apache.opennlp</groupId>
			<artifactId>opennlp-tools</artifactId>
			<version>1.9.3</version>
		</dependency>
 
		<dependency>
			<groupId>edu.stanford.nlp</groupId>
			<artifactId>stanford-corenlp</artifactId>
			<version>4.5.7</version>
		</dependency>
 
		<dependency>
			<groupId>edu.stanford.nlp</groupId>
			<artifactId>stanford-corenlp</artifactId>
			<version>4.5.7</version>
			<classifier>models</classifier>
		</dependency>
 
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>1.7.12</version>
		</dependency>
 
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
			<version>1.9</version>
		</dependency>
 
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-core</artifactId>
			<version>1.0.0-M1.1</version>
		</dependency>
 
		<dependency>
			<groupId>org.nd4j</groupId>
			<artifactId>nd4j-native-platform</artifactId>
			<version>1.0.0-M1.1</version>
		</dependency>
 
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-nlp</artifactId>
			<version>1.0.0-M1.1</version>
		</dependency>
	</dependencies>
</project>

Sentence Splitting

Using Rules and Stanford CoreNLP

SentenceSplitter.java
package cn.edu.bjut.chapter2;
 
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
 
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
 
public class SentenceSplitter {
	private static final String SPLITTER_REGEX = "(?<=[.!?])\\s+"; // split on whitespace that follows '.', '!', or '?'
 
	public static String[] splitByRule(String text) {
		Pattern pattern = Pattern.compile(SPLITTER_REGEX);
 
		return pattern.split(text); // split the text into sentences
	}
 
	public static List<CoreMap> splitByStanfordCoreNLP(String text) {
		// Set up the Stanford NLP pipeline
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize,ssplit");
		props.setProperty("outputFormat", "text");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
 
		// Create an Annotation object
		Annotation annotation = new Annotation(text); 
		// Annotate the document
		pipeline.annotate(annotation); 
		// Get sentences
		return annotation.get(CoreAnnotations.SentencesAnnotation.class); 
	}
 
	public static void main(String[] args) {
		String text = "Mr. Smith is here. This is a test! Isn't text mining "
				+ "amazing? Let's see e.g. this case.";
 
		String[] sentencesByRule = splitByRule(text);
		System.out.println("Sentences splitted by rules: ");
		for (String sentence : sentencesByRule) {
			System.out.println(sentence);
		}
 
		List<CoreMap> sentencesByStanfordCoreNLP = splitByStanfordCoreNLP(text);
		System.out.println("Sentences splitted by Stanford CoreNLP: ");
		for (CoreMap sentence : sentencesByStanfordCoreNLP) {
			System.out.println(sentence.toString());
		}
	}
}

If the following error message appears at runtime:
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.

Following the advice at http://www.slf4j.org/codes.html#StaticLoggerBinder, it can be resolved by adding any one of slf4j-nop.jar, slf4j-simple.jar, slf4j-log4j12.jar, slf4j-jdk14.jar, or logback-classic.jar to the classpath.
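
For example, one option (a sketch using the slf4j-simple binding; any of the bindings listed above would work) is to declare the corresponding dependency in pom.xml, keeping the version in line with the slf4j-api dependency already declared there:

		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-simple</artifactId>
			<version>1.7.12</version>
		</dependency>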

Using OpenNLP

SentenceSpliterByOpenNLP.java
package cn.edu.bjut.chapter2;
 
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
 
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
 
public class SentenceSpliterByOpenNLP {
	private static SentenceDetectorME detector = null; 
	private static final String MODEL_FILE = 
			"resource/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"; 
 
	private static void loadModel(String fname) {
		SentenceModel model = null; 
		try {
			InputStream stream = new FileInputStream(fname); 
 
			try {
				model = new SentenceModel(stream); 
			} catch (IOException e) {
				e.printStackTrace(); 
			} finally {
				if (stream != null) {
					try {
						stream.close(); 
					} catch (IOException e) {
						e.printStackTrace(); 
					}
				}
			}
 
			detector = new SentenceDetectorME(model); 
		} catch (FileNotFoundException e) {
			e.printStackTrace(); 
		}
	}
 
	public static String[] detect(String str) {
		if (detector == null) {
			loadModel(MODEL_FILE); 
		}
 
		return detector.sentDetect(str); 
	}
 
	public static void main(String[] args) {
		String str = "Mr. Smith is here. This is a test! Isn't text mining "
				+ "amazing? Let's see e.g. this case."; 
		String[] sentences = detect(str); 
		System.out.println("Sentences splitted by OpenNLP: ");
		for (String sentence: sentences) {
			System.out.println(sentence); 
		}
	}
}

Tokenization

Using WhitespaceTokenizer, SimpleTokenizer, and Stanford CoreNLP

Tokenizer.java
package cn.edu.bjut.chapter2;
 
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
 
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
 
public class Tokenizer {
	public static String[] tokenizeByWhitespace(String text) {
		WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
		return tokenizer.tokenize(text); 
	}
 
	public static String[] tokenizeBySimple(String text) {
		SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
		return tokenizer.tokenize(text); 
	}
 
	public static List<CoreLabel> tokenizeByStanfordCoreNLP(String text) {
		// Set up the pipeline properties
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize");
		props.setProperty("outputFormat", "text");
 
		// Build the pipeline
		StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
		// Create an annotation object
		Annotation annotation = new Annotation(text);
		// Annotate the text
		pipeline.annotate(annotation);
 
		// Retrieve the tokens
		List<CoreMap> sentences = 
				annotation.get(CoreAnnotations.SentencesAnnotation.class);
		List<CoreLabel> tokens = new ArrayList<CoreLabel>(); 
		for (CoreMap sentence : sentences) {
			tokens.addAll(sentence.get(
					CoreAnnotations.TokensAnnotation.class)); 
		}
 
		return tokens;
	}
 
	public static void main(String[] args) {
		String text = "Natural Language Processing (NLP) is essential "
				+ "for text analysis and text mining."; 
 
		String[] tokensByWhitespace = tokenizeByWhitespace(text); 
		System.out.println("Tokens tokenized by WhitespaceTokenizer: ");
		for (String token : tokensByWhitespace) {
			System.out.println(token);
		}
 
		String[] tokensBySimple = tokenizeBySimple(text); 
		System.out.println("Tokens tokenized by SimpleTokenizer: ");
		for (String token : tokensBySimple) {
			System.out.println(token);
		}
 
		List<CoreLabel> tokensByStanfordCoreNLP = 
				tokenizeByStanfordCoreNLP(text); 
		System.out.println("Tokens tokenized by StanfordCoreNLP: ");
		for (CoreLabel token : tokensByStanfordCoreNLP) {
			System.out.println(token.word());
		}
	}
}

Using OpenNLP

TokenizerByOpenNLP.java
package cn.edu.bjut.chapter2;
 
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
 
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
 
public class TokenizerByOpenNLP {
	private static TokenizerME tokenizer = null; 
	private static final String MODEL_FILE = 
			"resource/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin"; 
 
	private static void loadModel(String fname) {
		TokenizerModel model = null; 
		try {
			InputStream stream = new FileInputStream(fname); 
 
			try {
				model = new TokenizerModel(stream); 
			} catch (IOException e) {
				e.printStackTrace(); 
			} finally {
				if (stream != null) {
					try {
						stream.close(); 
					} catch (IOException e) {
						e.printStackTrace(); 
					}
				}
			}
 
			tokenizer = new TokenizerME(model); 
		} catch (FileNotFoundException e) {
			e.printStackTrace(); 
		}
	}
 
	public static String[] tokenize(String str) {
		if (tokenizer == null) {
			loadModel(MODEL_FILE); 
		}
 
		return tokenizer.tokenize(str); 
	}
 
	public static void main(String[] args) {
		String str = "Natural Language Processing (NLP) is essential "
				+ "for text analysis and text mining.";
		String[] tokens = tokenize(str); 
		System.out.println("Tokens tokenized by OpenNLP: ");
		for (int i = 0; i < tokens.length; i++) {
			System.out.println(tokens[i]); 
		}
	}
}

Part-of-Speech (PoS) Tagging

Using Stanford CoreNLP

PoSTaggerByStanfordCoreNLP.java
package cn.edu.bjut.chapter2;
 
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
 
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
 
public class PoSTaggerByStanfordCoreNLP {
	public static List<CoreLabel> tag(String text) {
		// Set up the pipeline properties
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize,ssplit,pos");
		props.setProperty("outputFormat", "text");
 
		// Build the pipeline
		StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
		// Create an annotation object
		Annotation annotation = new Annotation(text);
		// Annotate the text
		pipeline.annotate(annotation);
 
		// Retrieve the tokens
		List<CoreMap> sentences = annotation.get(
				CoreAnnotations.SentencesAnnotation.class);
		List<CoreLabel> tokens = new ArrayList<CoreLabel>();
		for (CoreMap sentence : sentences) {
			tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
		}
 
		return tokens;
	}
 
	public static void main(String[] args) {
		String text = "Natural Language Processing (NLP) is essential for text "
				+ "analysis and text mining.";
 
		List<CoreLabel> tokens = tag(text);
		for (CoreLabel token : tokens) {
			System.out.println(token.word() + ": " + token.tag());
		}
	}
}

Using OpenNLP

PoSTaggerByOpenNLP.java
package cn.edu.bjut.chapter2;
 
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
 
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
 
public class PoSTaggerByOpenNLP {
	private static POSTaggerME tagger = null;
	private static final String MODEL_FILE = 
			"resource/opennlp-en-ud-ewt-pos-1.0-1.9.3.bin";
 
	private static void loadModel(String fname) {
		POSModel model = null;
		try {
			InputStream stream = new FileInputStream(fname);
 
			try {
				model = new POSModel(stream);
			} catch (IOException e) {
				e.printStackTrace();
			} finally {
				if (stream != null) {
					try {
						stream.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
 
			tagger = new POSTaggerME(model);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
	}
 
	public static String[] tag(String[] tokens) {
		if (tagger == null) {
			loadModel(MODEL_FILE);
		}
 
		return tagger.tag(tokens);
	}
 
	public static void main(String[] args) {
		String tokens[] = new String[] { "Most", "large", "cities", "in", "the", 
				"US", "had", "morning", "and", "afternoon", "newspapers", "." };
		String tags[] = tag(tokens);
 
		for (int i = 0; i < tags.length; i++) {
			System.out.println(tokens[i] + "\t" + tags[i]);
		}
	}
}

Stemming

Stemmer.java
package cn.edu.bjut.chapter2;
 
import opennlp.tools.stemmer.PorterStemmer;
 
public class Stemmer {
	public static String stem(String token) {
		PorterStemmer stemmer = new PorterStemmer();
		return stemmer.stem(token);
	}
 
	public static void main(String[] args) {
		String[] words = { "running", "jumps", "better", "happily" };
 
		for (String word : words) {
			String stem = stem(word);
			System.out.println(word + "-->" + stem);
		}
	}
}

Lemmatization

Lemmatizer.java
package cn.edu.bjut.chapter2;
 
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
 
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
 
public class Lemmatizer {
	public static List<CoreLabel> tag(String text) {
		// Set up the pipeline properties
		Properties props = new Properties();
		props.setProperty("annotators", "tokenize,ssplit,pos,lemma");
		props.setProperty("outputFormat", "text");
 
		// Build the pipeline
		StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
		// Create an annotation object
		Annotation annotation = new Annotation(text);
		// Annotate the text
		pipeline.annotate(annotation);
 
		// Retrieve the tokens
		List<CoreMap> sentences = annotation.get(
				CoreAnnotations.SentencesAnnotation.class);
		List<CoreLabel> tokens = new ArrayList<CoreLabel>();
		for (CoreMap sentence : sentences) {
			tokens.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
		}
 
		return tokens;
	}
 
	public static void main(String[] args) {
		String text = "Natural Language Processing (NLP) is essential "
				+ "for text analysis and text mining.";
 
		List<CoreLabel> tokens = tag(text);
		for (CoreLabel token : tokens) {
			System.out.println(token.word() + "-->" + token.lemma());
		}
	}
}

Stopword Filtering

StopwordChecker.java
package cn.edu.bjut.chapter2;
 
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
 
public class StopwordChecker {
	private Set<String> stopwords;
	private String fname = "resource/stoplist.txt";
	private boolean caseSensitive;
 
	public StopwordChecker(boolean caseSensitive) {
		this.stopwords = new HashSet<String>();
		this.caseSensitive = caseSensitive;
 
		load(this.fname);
	}
 
	public StopwordChecker() {
		this(false);
	}
 
	private void load(final String fname) {
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(fname), "UTF-8"));
 
			for (String line = null; (line = reader.readLine()) != null;) {
				line = line.trim();
				stopwords.add(caseSensitive ? line : line.toLowerCase());
			}
 
			reader.close();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
 
	public boolean check(String word) {
		return stopwords.contains(caseSensitive ? word : word.toLowerCase());
	}
 
	public static void main(String[] args) {
		StopwordChecker checker = new StopwordChecker();
 
		String[] tokens = { "Text", "analysis", "and", "text", "mining", 
				"are", "amazing", "!" };
		for (String token : tokens) {
			System.out.println(token + ": " + checker.check(token));
		}
	}
}

TF-IDF
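
As implemented in the two classes below, the term frequency of a term t in a document d is its count among the non-stopword tokens of d divided by the total number of such tokens, the inverse document frequency is the natural logarithm of the number of documents N over the document frequency df(t), and the reported score sums the per-document products:

$$\mathrm{tf}(t,d)=\frac{\mathrm{count}(t,d)}{\sum_{t'}\mathrm{count}(t',d)}, \qquad \mathrm{idf}(t)=\ln\frac{N}{\mathrm{df}(t)}, \qquad \mathrm{tfidf}(t)=\sum_{d}\mathrm{tf}(t,d)\cdot\mathrm{idf}(t)$$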

Document.java
package cn.edu.bjut.chapter2;
 
import java.util.HashMap;
import java.util.Map;
 
public class Document {
	private String content;
	private Map<String, Double> tfMap;
	private StopwordChecker stopwordChecker;
 
	public Document(String content) {
		this.content = content;
		this.stopwordChecker = new StopwordChecker();
		calculateTermFrequency();
	}
 
	private void calculateTermFrequency() {
		this.tfMap = new HashMap<String, Double>();
 
		int total = 0;
		for (String sentence : SentenceSpliterByOpenNLP.detect(content)) {
			for (String token : TokenizerByOpenNLP.tokenize(sentence)) {
				if (!stopwordChecker.check(token)) {
					total++;
 
					token = token.toLowerCase();
					tfMap.put(token, tfMap.getOrDefault(token, 0.0) + 1);
				}
			}
		}
 
		for (String key : tfMap.keySet()) {
			tfMap.put(key, tfMap.get(key) / total);
		}
	}
 
	public Map<String, Double> getTermFrequency() {
		return tfMap;
	}
}
TFIDFAnalyzer.java
package cn.edu.bjut.chapter2;
 
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
 
public class TFIDFAnalyzer {
	private List<Document> documents;
 
	public TFIDFAnalyzer(List<Document> documents) {
		this.documents = documents;
	}
 
	public Map<String, Double> calculateIDF() {
		Map<String, Double> idf = new HashMap<String, Double>();
		int totalDocuments = documents.size();
 
		for (Document document : documents) {
			Set<String> tokens = document.getTermFrequency().keySet();
			for (String token : tokens) {
				idf.put(token, idf.getOrDefault(token, 0.0) + 1);
			}
		}
 
		for (String key : idf.keySet()) {
			idf.put(key, Math.log(totalDocuments / idf.get(key))); 
		}
		return idf;
	}
 
	public Map<String, Double> calculateTFIDF() {
		Map<String, Double> tfidf = new HashMap<String, Double>();
		Map<String, Double> idf = calculateIDF();
 
		for (Document document : documents) {
			Map<String, Double> tf = document.getTermFrequency();
			for (String term : tf.keySet()) {
				double tfidfValue = tf.get(term) * idf.getOrDefault(term, 0.0);
				tfidf.put(term, tfidf.getOrDefault(term, 0.0) + tfidfValue);
			}
		}
		return tfidf;
	}
 
	public static void main(String[] args) {
		List<Document> documents = new ArrayList<Document>();
		documents.add(new Document("A rose is very beautiful."));
		documents.add(new Document("A rose is a flower, which is a rose."));
 
		TFIDFAnalyzer tfidfAnalyzer = new TFIDFAnalyzer(documents);
		Map<String, Double> tfidf = tfidfAnalyzer.calculateTFIDF();
		for (Map.Entry<String, Double> entry : tfidf.entrySet()) {
			System.out.println(entry.getKey() + ": " + entry.getValue());
		}
	}
}

Word Embedding

Word2VecExample.java
package cn.edu.bjut.chapter2;
 
import java.io.File;
import java.util.Arrays;
import java.util.Collection;
 
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.documentiterator.FileDocumentIterator;
 
public class Word2VecExample {
	public static void main(String[] args) throws Exception {
		// Step 1: Prepare the Dataset
		String trainFile = "data/train.txt";
		FileDocumentIterator iterator = new FileDocumentIterator(new File(trainFile));
 
		// Step 2: Build the Word2Vec Model
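		// Builder parameters: epochs = number of passes over the corpus,
		// layerSize = dimensionality of the word vectors, minWordFrequency =
		// minimum occurrences for a word to be kept in the vocabulary,
		// seed = random seed for reproducibility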
		Word2Vec word2Vec = new Word2Vec.Builder()
				.iterate(iterator).epochs(5)
				.layerSize(100).minWordFrequency(1)
				.seed(42).build();
 
		// Step 3: Fit the Model
		word2Vec.fit();
 
		// Get a word vector
		double[] vector = word2Vec.getWordVector("Java");
		System.out.println("Java's Vector: " + Arrays.toString(vector));
 
		// Find closest words to 'Java'
		System.out.println("Closest words to 'Java': ");
		Collection<String> nearestWords = word2Vec.wordsNearest("Java", 5);
		System.out.println(nearestWords);
	}
}

Homework

  1. Given a passage of text (in Chinese or English), perform sentence splitting, tokenization, stemming, lemmatization, part-of-speech tagging, and stopword filtering (one possible way of chaining the classes above is sketched after this list).
    1. Chinese text: 文本挖掘涉及自然语言处理、模式分类和机器学习等多种技术,属于具有明确应用目标的多技术交叉研究领域。无论是前面介绍的数据预算和数据样本标,还是实现后面介绍的某些数据挖掘方法,通常需要用到很多基础性的技术和方法。
    2. English text: Latent Dirichlet Allocation (LDA) is a popular topic modeling technique for exploring document collections. Because of the increasing prevalence of large datasets, there is a need to improve the scalability of inference for LDA. In this paper, we introduce a novel and flexible large scale topic modeling package in MapReduce (Mr. LDA). As opposed to other techniques which use Gibbs sampling, our proposed framework uses variational inference, which easily fits into a distributed environment. More importantly, this variational implementation, unlike highly tuned and specialized implementations based on Gibbs sampling, is easily extensible. We demonstrate two extensions of the models possible with this scalable framework: informed priors to guide topic discovery and extracting topics from a multilingual corpus. We compare the scalability of Mr. LDA against Mahout, an existing large scale topic modeling package. Mr. LDA out-performs Mahout both in execution speed and held-out likelihood.
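
A minimal sketch (not an official solution) of how the classes from this chapter could be chained for the English text above. It assumes the OpenNLP model files and the stoplist are available at the paths used in the earlier examples; the class name HomeworkPipeline is only illustrative.

HomeworkPipeline.java
package cn.edu.bjut.chapter2;
 
public class HomeworkPipeline {
	public static void main(String[] args) {
		String text = "Latent Dirichlet Allocation (LDA) is a popular topic "
				+ "modeling technique for exploring document collections.";
 
		StopwordChecker checker = new StopwordChecker();
		// Sentence splitting with OpenNLP
		for (String sentence : SentenceSpliterByOpenNLP.detect(text)) {
			// Tokenization and PoS tagging with OpenNLP
			String[] tokens = TokenizerByOpenNLP.tokenize(sentence);
			String[] tags = PoSTaggerByOpenNLP.tag(tokens);
 
			for (int i = 0; i < tokens.length; i++) {
				// Stopword filtering
				if (checker.check(tokens[i])) {
					continue;
				}
				// Stemming with the Porter stemmer; lemmatization can be run
				// separately with the Lemmatizer class above
				System.out.println(tokens[i] + "\t" + tags[i] + "\t"
						+ Stemmer.stem(tokens[i]));
			}
		}
	}
}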

Comments

谈星宇, 2025/09/17 09:23

It would be better to provide the bin files directly instead of making us waste half a day solving a problem of little real value.

谈星宇, 2025/09/17 09:26

The official site no longer offers the 1.0 bin files; only versions such as "opennlp-en-ud-ewt-sentence-1.3-2.5.4.bin" are available, and updating the pom file to match leads to version incompatibilities.

谈星宇, 2025/09/17 11:01

Folks, the bin-file incompatibility problem is solved: just switch from Java 8 to Java 17!
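
Based on the comment above, one way to raise the compiler level is through Maven's standard compiler properties in pom.xml (a sketch only; adapt it to your own build setup):

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<maven.compiler.source>17</maven.compiler.source>
		<maven.compiler.target>17</maven.compiler.target>
	</properties>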
