package cn.edu.bjut.chapter4; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; import org.apache.commons.math3.ml.clustering.Cluster; import org.apache.commons.math3.ml.clustering.DBSCANClusterer; public class DBSCANClustering { public static List> cluster(double epsilon, int minPoints, List documents) { // Suppose we have the points ready List points = new ArrayList(); Random rand = new Random(0); for (String document : documents) { // Here we could represent document as points based on TF-IDF // For simplicity, we will use random values or replace with actual TF-IDF // vectors. double[] randomPoints = new double[] { rand.nextDouble(), rand.nextDouble() }; points.add(new DocumentPoint(randomPoints)); } // Populate DocumentPoints from documents as before DBSCANClusterer dbscan = new DBSCANClusterer(epsilon, minPoints); return dbscan.cluster(points); } public static void main(String[] args) { int k = 5; String[] documents = { "Java is a high-level, class-based, object-oriented programming language.", "Python is an interpreted, high-level and general-purpose programming language.", "JavaScript is a programming language that conforms to the ECMAScript specification.", "C++ is a general-purpose programming language created by Bjarne Stroustrup.", "PHP is a popular general-purpose scripting language that is especially suited to web development.", "Ruby is a dynamic, open source programming language with a focus on simplicity and productivity." }; List documentList = Arrays.asList(documents); List> clusters = DBSCANClustering.cluster(0.5, 1, documentList); for (Cluster cluster : clusters) { for (DocumentPoint doc : cluster.getPoints()) { System.out.println(doc); } System.out.println(); } } }