用户工具

站点工具


zh:courses:textmining2025:ch04

第四章:文本聚类

课件

下载:文本聚类

相关资源

POM配置文件

pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
 
	<groupId>cn.edu.bjut</groupId>
	<artifactId>text-mining</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>
 
	<name>text-mining</name>
	<url>http://maven.apache.org</url>
 
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>
 
	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
 
		<!-- Required by DocumentPoint / KMeansClustering / DBSCANClustering:
		     they import org.apache.commons.math3.ml.clustering.*, which was
		     missing from the original dependency list. -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-math3</artifactId>
			<version>3.6.1</version>
		</dependency>
 
		<dependency>
			<groupId>org.apache.opennlp</groupId>
			<artifactId>opennlp-tools</artifactId>
			<version>1.9.3</version>
		</dependency>
 
		<dependency>
			<groupId>edu.stanford.nlp</groupId>
			<artifactId>stanford-corenlp</artifactId>
			<version>4.5.7</version>
		</dependency>
 
		<dependency>
			<groupId>edu.stanford.nlp</groupId>
			<artifactId>stanford-corenlp</artifactId>
			<version>4.5.7</version>
			<classifier>models</classifier>
		</dependency>
 
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>1.7.12</version>
		</dependency>
 
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
			<version>1.9</version>
		</dependency>
 
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-core</artifactId>
			<version>1.0.0-M1.1</version>
		</dependency>
 
		<dependency>
			<groupId>org.nd4j</groupId>
			<artifactId>nd4j-native-platform</artifactId>
			<version>1.0.0-M1.1</version>
		</dependency>
 
		<dependency>
			<groupId>org.deeplearning4j</groupId>
			<artifactId>deeplearning4j-nlp</artifactId>
			<version>1.0.0-M1.1</version>
		</dependency>
 
		<dependency>
			<groupId>cc.mallet</groupId>
			<artifactId>mallet</artifactId>
			<version>2.0.8</version>
		</dependency>
 
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.12.0</version>
		</dependency>

		<!-- NOTE: APClustering additionally uses fr.lri.tao.apro (the Apro
		     affinity-propagation library), which is not on Maven Central and
		     must be installed into the local repository manually. -->
	</dependencies>
</project>

DocumentPoint类

DocumentPoint.java
package cn.edu.bjut.chapter4;
 
import org.apache.commons.math3.ml.clustering.Clusterable;
 
public class DocumentPoint implements Clusterable {
	private double[] points; 
 
	public DocumentPoint(double[] points) {
		this.points = points;
	}
 
	public double[] getPoint() {
		return this.points;
	}
 
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder(); 
 
		for (int i = 0; i < points.length; i++) {
			if (Math.abs(points[i]) < 1e-5) {
				continue; 
			}
 
			sb.append((i + 1) + ":" + points[i] + " "); 
		}
 
		return sb.toString(); 
	}
}

KMeansClustering类

KMeansClustering.java
package cn.edu.bjut.chapter4;
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
 
import org.apache.commons.math3.ml.clustering.CentroidCluster;
import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
 
public class KMeansClustering {
	public static List<CentroidCluster<DocumentPoint>> cluster(int k, List<String> documents) {
		// Convert documents into points (this is a simplified version)
		List<DocumentPoint> points = new ArrayList<DocumentPoint>();
		Random rand = new Random(0);
		for (String document : documents) {
			// Here we could represent document as points based on TF-IDF
			// For simplicity, we will use random values or replace with actual TF-IDF
			// vectors.
			double[] randomPoints = new double[] { rand.nextDouble(), rand.nextDouble() };
			points.add(new DocumentPoint(randomPoints));
		}
 
		KMeansPlusPlusClusterer<DocumentPoint> kMeans = new KMeansPlusPlusClusterer<DocumentPoint>(k);
		return kMeans.cluster(points);
	}
 
	public static void main(String[] args) {
		int k = 5;
		String[] documents = { "Java is a high-level, class-based, object-oriented programming language.",
				"Python is an interpreted, high-level and general-purpose programming language.",
				"JavaScript is a programming language that conforms to the ECMAScript specification.",
				"C++ is a general-purpose programming language created by Bjarne Stroustrup.",
				"PHP is a popular general-purpose scripting language that is especially suited to web development.",
				"Ruby is a dynamic, open source programming language with a focus on simplicity and productivity." };
		List<String> documentList = Arrays.asList(documents);
 
		List<CentroidCluster<DocumentPoint>> clusters = KMeansClustering.cluster(k, documentList);
		for (CentroidCluster<DocumentPoint> centroid : clusters) {
			List<DocumentPoint> cluster = centroid.getPoints();
			for (DocumentPoint doc : cluster) {
				System.out.println(doc);
			}
 
			System.out.println();
		}
	}
}

DBSCANClustering类

DBSCANClustering.java
package cn.edu.bjut.chapter4;
 
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
 
import org.apache.commons.math3.ml.clustering.Cluster;
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
 
public class DBSCANClustering {
	public static List<Cluster<DocumentPoint>> cluster(double epsilon, int minPoints, List<String> documents) {
		// Suppose we have the points ready
		List<DocumentPoint> points = new ArrayList<DocumentPoint>();
		Random rand = new Random(0);
		for (String document : documents) {
			// Here we could represent document as points based on TF-IDF
			// For simplicity, we will use random values or replace with actual TF-IDF
			// vectors.
			double[] randomPoints = new double[] { rand.nextDouble(), rand.nextDouble() };
			points.add(new DocumentPoint(randomPoints));
		}
 
		// Populate DocumentPoints from documents as before
		DBSCANClusterer<DocumentPoint> dbscan = new DBSCANClusterer<DocumentPoint>(epsilon, minPoints);
		return dbscan.cluster(points);
	}
 
	public static void main(String[] args) {
		int k = 5;
		String[] documents = { "Java is a high-level, class-based, object-oriented programming language.",
				"Python is an interpreted, high-level and general-purpose programming language.",
				"JavaScript is a programming language that conforms to the ECMAScript specification.",
				"C++ is a general-purpose programming language created by Bjarne Stroustrup.",
				"PHP is a popular general-purpose scripting language that is especially suited to web development.",
				"Ruby is a dynamic, open source programming language with a focus on simplicity and productivity." };
		List<String> documentList = Arrays.asList(documents);
 
		List<Cluster<DocumentPoint>> clusters = DBSCANClustering.cluster(0.5, 1, documentList);
		for (Cluster<DocumentPoint> cluster : clusters) {
			for (DocumentPoint doc : cluster.getPoints()) {
				System.out.println(doc);
			}
 
			System.out.println();
		}
	}
}

Affinity Propagation (AP)聚类

APClustering.java
package cn.edu.bjut.chapter5;
 
import java.util.Random;
 
import fr.lri.tao.apro.ap.Apro;
import fr.lri.tao.apro.ap.AproBuilder;
import fr.lri.tao.apro.data.DataProvider;
import fr.lri.tao.apro.data.MatrixProvider;
 
public class APClustering {
	public static void main(String[] args) {
		Random rand = new Random(20241015); 
 
		double[][] similarity = new double[50][50]; 
		for (int i = 0; i < similarity.length - 1; i++) {
			similarity[i][i] = 10.0; 
			for (int j = i + 1; j < similarity[i].length; j++) {
				similarity[i][j] = similarity[j][i] = rand.nextDouble(); 
			}
		}
 
		DataProvider provider = new MatrixProvider(similarity);
		AproBuilder builder = new AproBuilder(); 
		builder.setFullAuto(); 
		Apro apro = builder.build(provider);
		apro.run(200);
 
		int[] exemplars = apro.getExemplars(); 
		for (int nodeId = 0; nodeId < exemplars.length; nodeId++) {
			System.out.println(nodeId + ": " + exemplars[nodeId]); 
		}
	}
}

相似度计算

SimilarityCalculator.java
package cn.edu.bjut.chapter5;
 
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
 
/** Text similarity measures: cosine, Jaccard, and Levenshtein distance. */
public class SimilarityCalculator {
	/**
	 * Cosine similarity between the bag-of-words term-frequency vectors of the
	 * two texts (lower-cased, whitespace-tokenized).
	 *
	 * @return similarity in [0, 1]; 0.0 when either text has no tokens
	 *         (the original divided by zero and returned NaN for e.g. " ")
	 */
	public static double calculateCosineSimilarity(String text1, String text2) {
		Map<String, Integer> vector1 = getWordFrequency(text1);
		Map<String, Integer> vector2 = getWordFrequency(text2);
 
		// Dot product over the words the two vectors share.
		double dotProduct = 0.0;
		for (Map.Entry<String, Integer> entry : vector1.entrySet()) {
			Integer count2 = vector2.get(entry.getKey());
			if (count2 != null) {
				dotProduct += entry.getValue() * count2;
			}
		}
 
		double magnitude1 = calculateMagnitude(vector1);
		double magnitude2 = calculateMagnitude(vector2);
 
		// Guard against division by zero when a text tokenizes to nothing
		// (e.g. whitespace-only input yields an empty frequency map).
		if (magnitude1 == 0.0 || magnitude2 == 0.0) {
			return 0.0;
		}
 
		return dotProduct / (magnitude1 * magnitude2);
	}
 
	/** Lower-cases the text, splits on whitespace, and counts each token. */
	private static Map<String, Integer> getWordFrequency(String text) {
		String[] words = text.toLowerCase().split("\\s+");
		Map<String, Integer> wordCount = new HashMap<String, Integer>();
		for (String word : words) {
			wordCount.put(word, wordCount.getOrDefault(word, 0) + 1);
		}
 
		return wordCount;
	}
 
	/** Euclidean (L2) norm of a term-frequency vector. */
	private static double calculateMagnitude(Map<String, Integer> vector) {
		double sum = 0.0;
		for (int value : vector.values()) {
			sum += value * value;
		}
 
		return Math.sqrt(sum);
	}
 
	/**
	 * Jaccard similarity |A ∩ B| / |A ∪ B| of the two comma-separated item
	 * lists (items are split on "," plus optional following whitespace).
	 *
	 * @return similarity in [0, 1]; 0.0 if the union is somehow empty
	 */
	public static double calculateJaccardSimilarity(String text1, String text2) {
		Set<String> set1 = new HashSet<String>(Arrays.asList(text1.split(",\\s*")));
		Set<String> set2 = new HashSet<String>(Arrays.asList(text2.split(",\\s*")));
 
		Set<String> intersection = new HashSet<String>(set1);
		intersection.retainAll(set2);
 
		Set<String> union = new HashSet<String>(set1);
		union.addAll(set2);
 
		// Defensive: avoid 0/0 should both item sets ever be empty.
		if (union.isEmpty()) {
			return 0.0;
		}
 
		return (double) intersection.size() / union.size();
	}
 
	/**
	 * Levenshtein (edit) distance via the classic dynamic-programming table:
	 * dp[i][j] is the distance between the first i chars of str1 and the
	 * first j chars of str2.
	 *
	 * @return minimum number of single-character insertions, deletions, and
	 *         substitutions turning str1 into str2
	 */
	public static int calculateLevenshteinDistance(String str1, String str2) {
		int[][] dp = new int[str1.length() + 1][str2.length() + 1];
 
		for (int i = 0; i <= str1.length(); i++) {
			for (int j = 0; j <= str2.length(); j++) {
				if (i == 0) {
					// Empty prefix of str1: insert all j characters.
					dp[i][j] = j;
				} else if (j == 0) {
					// Empty prefix of str2: delete all i characters.
					dp[i][j] = i;
				} else {
					int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) 
							? 0 : 1;
					// Minimum of deletion, insertion, and substitution/match.
					dp[i][j] = Math.min(Math.min(dp[i - 1][j] + 1, 
							dp[i][j - 1] + 1), 
							dp[i - 1][j - 1] + cost);
				}
			}
		}
 
		return dp[str1.length()][str2.length()];
	}
 
	/** Small demo of the three measures. */
	public static void main(String[] args) {
		{
			String text1 = "I love programming in Java";
			String text2 = "Java programming is amazing";
 
			double similarity = calculateCosineSimilarity(text1, text2);
			System.out.println("Cosine Similarity: " + similarity);
		}
 
		{
			String text1 = "cat, dog, mouse";
			String text2 = "dog, elephant";
 
			double similarity = calculateJaccardSimilarity(text1, text2);
			System.out.println("Jaccard Similarity: " + similarity);
		}
 
		{
			String str1 = "kitten";
			String str2 = "sitting";
 
			int distance = calculateLevenshteinDistance(str1, str2);
			System.out.println("Levenshtein Distance: " + distance);
		}
	}
}

作业

  1. 修改KMeansClustering类、DBSCANClustering类和APClustering类,对文本内容进行适当预处理,将其转换为TF-IDF特征向量,然后进行聚类分析。
  2. 自行查找谱聚类的Java实现,完成跨领域产品评论数据的谱聚类分析,其中两个领域的产品评论数据(中英文均可)大家自行搜寻。

评论

请输入您的评论. 可以使用维基语法:
103 +2 = 
 
zh/courses/textmining2025/ch04.txt · 最后更改: 2025/10/14 09:43 由 pzczxs