下载:文本聚类
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>cn.edu.bjut</groupId>
  <artifactId>text-mining</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>text-mining</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Testing -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>

    <!-- NLP toolkits -->
    <dependency>
      <groupId>org.apache.opennlp</groupId>
      <artifactId>opennlp-tools</artifactId>
      <version>1.9.3</version>
    </dependency>
    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-corenlp</artifactId>
      <version>4.5.7</version>
    </dependency>
    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-corenlp</artifactId>
      <version>4.5.7</version>
      <classifier>models</classifier>
    </dependency>

    <!-- Logging API -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.12</version>
    </dependency>

    <!-- Apache Commons -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-text</artifactId>
      <version>1.9</version>
    </dependency>
    <!--
      The clustering classes (DocumentPoint, KMeansClustering, DBSCANClustering)
      import org.apache.commons.math3.ml.clustering directly, so the library is
      declared explicitly rather than left to whatever another dependency may
      pull in transitively.
    -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-math3</artifactId>
      <version>3.6.1</version>
    </dependency>

    <!-- Deep learning / embeddings -->
    <dependency>
      <groupId>org.deeplearning4j</groupId>
      <artifactId>deeplearning4j-core</artifactId>
      <version>1.0.0-M1.1</version>
    </dependency>
    <dependency>
      <groupId>org.nd4j</groupId>
      <artifactId>nd4j-native-platform</artifactId>
      <version>1.0.0-M1.1</version>
    </dependency>
    <dependency>
      <groupId>org.deeplearning4j</groupId>
      <artifactId>deeplearning4j-nlp</artifactId>
      <version>1.0.0-M1.1</version>
    </dependency>

    <!-- Topic modelling -->
    <dependency>
      <groupId>cc.mallet</groupId>
      <artifactId>mallet</artifactId>
      <version>2.0.8</version>
    </dependency>

    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.12.0</version>
    </dependency>

    <!--
      NOTE(review): APClustering imports fr.lri.tao.apro.*, and no dependency
      declared in this POM obviously provides that package. Confirm how the
      Apro library is supplied to the build (e.g. a locally installed jar).
    -->
  </dependencies>
</project>
package cn.edu.bjut.chapter4; import org.apache.commons.math3.ml.clustering.Clusterable; public class DocumentPoint implements Clusterable { private double[] points; public DocumentPoint(double[] points) { this.points = points; } public double[] getPoint() { return this.points; } @Override public String toString() { StringBuilder sb = new StringBuilder(); for (int i = 0; i < points.length; i++) { if (Math.abs(points[i]) < 1e-5) { continue; } sb.append((i + 1) + ":" + points[i] + " "); } return sb.toString(); } }
package cn.edu.bjut.chapter4; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; import org.apache.commons.math3.ml.clustering.CentroidCluster; import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer; public class KMeansClustering { public static List<CentroidCluster<DocumentPoint>> cluster(int k, List<String> documents) { // Convert documents into points (this is a simplified version) List<DocumentPoint> points = new ArrayList<DocumentPoint>(); Random rand = new Random(0); for (String document : documents) { // Here we could represent document as points based on TF-IDF // For simplicity, we will use random values or replace with actual TF-IDF // vectors. double[] randomPoints = new double[] { rand.nextDouble(), rand.nextDouble() }; points.add(new DocumentPoint(randomPoints)); } KMeansPlusPlusClusterer<DocumentPoint> kMeans = new KMeansPlusPlusClusterer<DocumentPoint>(k); return kMeans.cluster(points); } public static void main(String[] args) { int k = 5; String[] documents = { "Java is a high-level, class-based, object-oriented programming language.", "Python is an interpreted, high-level and general-purpose programming language.", "JavaScript is a programming language that conforms to the ECMAScript specification.", "C++ is a general-purpose programming language created by Bjarne Stroustrup.", "PHP is a popular general-purpose scripting language that is especially suited to web development.", "Ruby is a dynamic, open source programming language with a focus on simplicity and productivity." }; List<String> documentList = Arrays.asList(documents); List<CentroidCluster<DocumentPoint>> clusters = KMeansClustering.cluster(k, documentList); for (CentroidCluster<DocumentPoint> centroid : clusters) { List<DocumentPoint> cluster = centroid.getPoints(); for (DocumentPoint doc : cluster) { System.out.println(doc); } System.out.println(); } } }
package cn.edu.bjut.chapter4; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; import org.apache.commons.math3.ml.clustering.Cluster; import org.apache.commons.math3.ml.clustering.DBSCANClusterer; public class DBSCANClustering { public static List<Cluster<DocumentPoint>> cluster(double epsilon, int minPoints, List<String> documents) { // Suppose we have the points ready List<DocumentPoint> points = new ArrayList<DocumentPoint>(); Random rand = new Random(0); for (String document : documents) { // Here we could represent document as points based on TF-IDF // For simplicity, we will use random values or replace with actual TF-IDF // vectors. double[] randomPoints = new double[] { rand.nextDouble(), rand.nextDouble() }; points.add(new DocumentPoint(randomPoints)); } // Populate DocumentPoints from documents as before DBSCANClusterer<DocumentPoint> dbscan = new DBSCANClusterer<DocumentPoint>(epsilon, minPoints); return dbscan.cluster(points); } public static void main(String[] args) { int k = 5; String[] documents = { "Java is a high-level, class-based, object-oriented programming language.", "Python is an interpreted, high-level and general-purpose programming language.", "JavaScript is a programming language that conforms to the ECMAScript specification.", "C++ is a general-purpose programming language created by Bjarne Stroustrup.", "PHP is a popular general-purpose scripting language that is especially suited to web development.", "Ruby is a dynamic, open source programming language with a focus on simplicity and productivity." }; List<String> documentList = Arrays.asList(documents); List<Cluster<DocumentPoint>> clusters = DBSCANClustering.cluster(0.5, 1, documentList); for (Cluster<DocumentPoint> cluster : clusters) { for (DocumentPoint doc : cluster.getPoints()) { System.out.println(doc); } System.out.println(); } } }
package cn.edu.bjut.chapter5;

import java.util.Random;

import fr.lri.tao.apro.ap.Apro;
import fr.lri.tao.apro.ap.AproBuilder;
import fr.lri.tao.apro.data.DataProvider;
import fr.lri.tao.apro.data.MatrixProvider;

/**
 * Demonstrates affinity-propagation clustering with the Apro library on a
 * randomly generated, symmetric similarity matrix.
 */
public class APClustering {

    /** Number of nodes in the demo similarity matrix. */
    private static final int NODE_COUNT = 50;

    /**
     * Value written on the diagonal of the similarity matrix.
     * NOTE(review): presumably interpreted by Apro as each node's
     * self-similarity / preference - confirm against the library documentation.
     */
    private static final double SELF_SIMILARITY = 10.0;

    /**
     * Builds the demo matrix, runs Apro on it and prints one
     * {@code nodeId: exemplar} line per entry of the returned exemplar array.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Fixed seed so the generated matrix is identical on every run.
        Random rand = new Random(20241015);
        double[][] similarity = buildSimilarityMatrix(NODE_COUNT, rand);

        DataProvider provider = new MatrixProvider(similarity);
        AproBuilder builder = new AproBuilder();
        builder.setFullAuto();
        Apro apro = builder.build(provider);
        apro.run(200);

        int[] exemplars = apro.getExemplars();
        for (int nodeId = 0; nodeId < exemplars.length; nodeId++) {
            System.out.println(nodeId + ": " + exemplars[nodeId]);
        }
    }

    /**
     * Creates a {@code size x size} symmetric matrix whose off-diagonal entries
     * are uniform random values in {@code [0, 1)} and whose diagonal entries are
     * all {@link #SELF_SIMILARITY}.
     */
    private static double[][] buildSimilarityMatrix(int size, Random rand) {
        double[][] similarity = new double[size][size];
        // Iterate over every row, including the last one: its inner loop is
        // empty, but its diagonal entry must still be initialised. Stopping at
        // size - 1 would leave similarity[size - 1][size - 1] at 0.0.
        for (int i = 0; i < size; i++) {
            similarity[i][i] = SELF_SIMILARITY;
            for (int j = i + 1; j < size; j++) {
                double value = rand.nextDouble();
                similarity[i][j] = value;
                similarity[j][i] = value;
            }
        }
        return similarity;
    }
}
package cn.edu.bjut.chapter5; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; public class SimilarityCalculator { public static double calculateCosineSimilarity(String text1, String text2) { Map<String, Integer> vector1 = getWordFrequency(text1); Map<String, Integer> vector2 = getWordFrequency(text2); double dotProduct = 0.0; for (String word : vector1.keySet()) { if (vector2.containsKey(word)) { dotProduct += vector1.get(word) * vector2.get(word); } } double magnitude1 = calculateMagnitude(vector1); double magnitude2 = calculateMagnitude(vector2); return dotProduct / (magnitude1 * magnitude2); } private static Map<String, Integer> getWordFrequency(String text) { String[] words = text.toLowerCase().split("\\s+"); Map<String, Integer> wordCount = new HashMap<String, Integer>(); for (String word : words) { wordCount.put(word, wordCount.getOrDefault(word, 0) + 1); } return wordCount; } private static double calculateMagnitude(Map<String, Integer> vector) { double sum = 0.0; for (int value : vector.values()) { sum += value * value; } return Math.sqrt(sum); } public static double calculateJaccardSimilarity(String text1, String text2) { Set<String> set1 = new HashSet<String>(Arrays.asList(text1.split(",\\s*"))); Set<String> set2 = new HashSet<String>(Arrays.asList(text2.split(",\\s*"))); Set<String> intersection = new HashSet<String>(set1); intersection.retainAll(set2); Set<String> union = new HashSet<String>(set1); union.addAll(set2); return (double) intersection.size() / union.size(); } public static int calculateLevenshteinDistance(String str1, String str2) { int[][] dp = new int[str1.length() + 1][str2.length() + 1]; for (int i = 0; i <= str1.length(); i++) { for (int j = 0; j <= str2.length(); j++) { if (i == 0) { dp[i][j] = j; } else if (j == 0) { dp[i][j] = i; } else { int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 
0 : 1; dp[i][j] = Math.min(Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1), dp[i - 1][j - 1] + cost); } } } return dp[str1.length()][str2.length()]; } public static void main(String[] args) { { String text1 = "I love programming in Java"; String text2 = "Java programming is amazing"; double similarity = calculateCosineSimilarity(text1, text2); System.out.println("Cosine Similarity: " + similarity); } { String text1 = "cat, dog, mouse"; String text2 = "dog, elephant"; double similarity = calculateJaccardSimilarity(text1, text2); System.out.println("Jaccard Similarity: " + similarity); } { String str1 = "kitten"; String str2 = "sitting"; int distance = calculateLevenshteinDistance(str1, str2); System.out.println("Levenshtein Distance: " + distance); } } }
评论