package cn.edu.bjut.chapter4; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; import org.apache.commons.math3.ml.clustering.CentroidCluster; import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer; public class KMeansClustering { public static List> cluster(int k, List documents) { // Convert documents into points (this is a simplified version) List points = new ArrayList(); Random rand = new Random(0); for (String document : documents) { // Here we could represent document as points based on TF-IDF // For simplicity, we will use random values or replace with actual TF-IDF // vectors. double[] randomPoints = new double[] { rand.nextDouble(), rand.nextDouble() }; points.add(new DocumentPoint(randomPoints)); } KMeansPlusPlusClusterer kMeans = new KMeansPlusPlusClusterer(k); return kMeans.cluster(points); } public static void main(String[] args) { int k = 5; String[] documents = { "Java is a high-level, class-based, object-oriented programming language.", "Python is an interpreted, high-level and general-purpose programming language.", "JavaScript is a programming language that conforms to the ECMAScript specification.", "C++ is a general-purpose programming language created by Bjarne Stroustrup.", "PHP is a popular general-purpose scripting language that is especially suited to web development.", "Ruby is a dynamic, open source programming language with a focus on simplicity and productivity." }; List documentList = Arrays.asList(documents); List> clusters = KMeansClustering.cluster(k, documentList); for (CentroidCluster centroid : clusters) { List cluster = centroid.getPoints(); for (DocumentPoint doc : cluster) { System.out.println(doc); } System.out.println(); } } }