Text Classification
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>cn.edu.bjut</groupId>
  <artifactId>text-mining</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>text-mining</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.opennlp</groupId>
      <artifactId>opennlp-tools</artifactId>
      <version>1.9.3</version>
    </dependency>
    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-corenlp</artifactId>
      <version>4.5.7</version>
    </dependency>
    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-corenlp</artifactId>
      <version>4.5.7</version>
      <classifier>models</classifier>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.12</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-text</artifactId>
      <version>1.9</version>
    </dependency>
    <dependency>
      <groupId>org.deeplearning4j</groupId>
      <artifactId>deeplearning4j-core</artifactId>
      <version>1.0.0-M1.1</version>
    </dependency>
    <dependency>
      <groupId>org.nd4j</groupId>
      <artifactId>nd4j-native-platform</artifactId>
      <version>1.0.0-M1.1</version>
    </dependency>
    <dependency>
      <groupId>org.deeplearning4j</groupId>
      <artifactId>deeplearning4j-nlp</artifactId>
      <version>1.0.0-M1.1</version>
    </dependency>
    <dependency>
      <groupId>cc.mallet</groupId>
      <artifactId>mallet</artifactId>
      <version>2.0.8</version>
    </dependency>
  </dependencies>
</project>
Set the MALLET_HOME environment variable to D:\Mallet-202108, then open a command prompt and run the Mallet launcher:
> d:
> cd Mallet-202108
> bin\mallet
Reference: http://mallet.cs.umass.edu/classification.php
> bin\mallet import-dir --input sample-data/web/* --output web.mallet
> bin\mallet train-classifier --input web.mallet --trainer NaiveBayes --output-classifier web.classifier
The --trainer option can be set to NaiveBayes, MaxEnt, DecisionTree, and other trainers.
> bin\mallet train-classifier --input web.mallet --trainer NaiveBayes --output-classifier web.classifier --training-portion 0.8
> bin\mallet train-classifier --input web.mallet --trainer NaiveBayes --output-classifier web.classifier --cross-validation 10
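To apply the saved classifier to new, unlabeled documents, Mallet also provides classify-dir and classify-file commands. The exact option names may differ slightly between versions, so treat the following as a sketch; new-web-pages is a hypothetical directory of unlabeled text files, and --output - writes the predicted labelings to standard output:
> bin\mallet classify-dir --input new-web-pages --output - --classifier web.classifier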
Reference: "Data Import for Java Developers" and "Document Classification Developer's Guide" (Mallet developer documentation)
package cn.edu.bjut.chapter3;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.regex.Pattern;

import cc.mallet.classify.Classifier;
import cc.mallet.classify.ClassifierTrainer;
import cc.mallet.classify.MaxEnt;
import cc.mallet.classify.MaxEntTrainer;
import cc.mallet.classify.Trial;
import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.FeatureSequence2FeatureVector;
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequenceLowercase;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.util.Randoms;

public class TextClassifierByMallet {
    private Pipe pipe;

    public TextClassifierByMallet() {
        this.pipe = buildPipe();
    }

    private Pipe buildPipe() {
        ArrayList<Pipe> pipeList = new ArrayList<Pipe>();

        // Read data from File objects
        pipeList.add(new Input2CharSequence("UTF-8"));

        // Regular expression for what constitutes a token.
        // This pattern includes Unicode letters, Unicode numbers,
        // and the underscore character. Alternatives:
        //   "\\S+"                       (anything not whitespace)
        //   "\\w+"                       (A-Z, a-z, 0-9, _)
        //   "[\\p{L}\\p{N}_]+|[\\p{P}]+" (a group of only letters and numbers
        //                                 OR a group of only punctuation marks)
        Pattern tokenPattern = Pattern.compile("[\\p{L}\\p{N}_]+");

        // Tokenize raw strings
        pipeList.add(new CharSequence2TokenSequence(tokenPattern));

        // Normalize all tokens to lowercase
        pipeList.add(new TokenSequenceLowercase());

        // Remove stopwords from a standard English stoplist.
        // options: [case sensitive] [mark deletions]
        pipeList.add(new TokenSequenceRemoveStopwords(false, false));

        // Rather than storing tokens as strings, convert
        // them to integers by looking them up in an alphabet.
        pipeList.add(new TokenSequence2FeatureSequence());

        // Do the same thing for the "target" field:
        // convert a class label string to a Label object,
        // which has an index in a Label alphabet.
        pipeList.add(new Target2Label());

        // Now convert the sequence of features to a sparse vector,
        // mapping feature IDs to counts.
        pipeList.add(new FeatureSequence2FeatureVector());

        // Print out the features and the label
        // pipeList.add(new PrintInputAndTarget());

        return new SerialPipes(pipeList);
    }

    public InstanceList readDirectory(File directory) {
        return readDirectories(new File[] { directory });
    }

    public InstanceList readDirectories(File[] directories) {
        // Construct a file iterator, starting with the
        // specified directories, and recursing through sub-directories.
        // The second argument specifies a FileFilter to use to select
        // files within a directory.
        // The third argument is a Pattern that is applied to the
        // filename to produce a class label. In this case, I've
        // asked it to use the last directory name in the path.
        //FileIterator iterator = new FileIterator(directories, new TxtFilter(), FileIterator.LAST_DIRECTORY);
        FileIterator iterator = new FileIterator(directories, null, FileIterator.LAST_DIRECTORY);

        // Construct a new instance list, passing it the pipe
        // we want to use to process instances.
        InstanceList instances = new InstanceList(pipe);

        // Now process each instance provided by the iterator.
        instances.addThruPipe(iterator);

        return instances;
    }

    public Classifier trainClassifier(InstanceList trainingInstances) {
        // Here we use a maximum entropy (i.e. polytomous logistic regression)
        // classifier. Mallet includes a wide variety of classification
        // algorithms; see the JavaDoc API for details.
        ClassifierTrainer<MaxEnt> trainer = new MaxEntTrainer();
        return trainer.train(trainingInstances);
    }

    public Classifier loadClassifier(File serializedFile)
            throws FileNotFoundException, IOException, ClassNotFoundException {
        // The standard way to save classifiers and Mallet data
        // for repeated use is through Java serialization.
        // Here we load a serialized classifier from a file.
        Classifier classifier;

        ObjectInputStream ois = new ObjectInputStream(new FileInputStream(serializedFile));
        classifier = (Classifier) ois.readObject();
        ois.close();

        return classifier;
    }

    public void saveClassifier(Classifier classifier, File serializedFile) throws IOException {
        // The standard method for saving classifiers in
        // Mallet is through Java serialization. Here we
        // write the classifier object to the specified file.
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(serializedFile));
        oos.writeObject(classifier);
        oos.close();
    }

    public Trial testTrainSplit(InstanceList instances) {
        int TRAINING = 0;
        int TESTING = 1;
        //int VALIDATION = 2;

        // Split the input list into training (90%) and testing (10%) lists.
        // The division takes place by creating a copy of the list,
        // randomly shuffling the copy, and then allocating
        // instances to each sub-list based on the provided proportions.
        InstanceList[] instanceLists = instances.split(new Randoms(), new double[] { 0.9, 0.1, 0.0 });

        // The third position is for the "validation" set,
        // which is a set of instances not used directly
        // for training, but available for determining
        // when to stop training and for estimating optimal
        // settings of nuisance parameters.
        // Most Mallet ClassifierTrainers can not currently take advantage
        // of validation sets.
        Classifier classifier = trainClassifier(instanceLists[TRAINING]);

        return new Trial(classifier, instanceLists[TESTING]);
    }

    public static void main(String[] args) throws IOException {
        String dirName = "data/20newsgroups";
        String outFileName = "data/20newsgroups.mallet";

        TextClassifierByMallet importer = new TextClassifierByMallet();
        InstanceList instances = importer.readDirectory(new File(dirName));
        instances.save(new File(outFileName));

        String modelFileName = "data/20newsgroups.classifier";
        Classifier classifier = importer.trainClassifier(instances);
        importer.saveClassifier(classifier, new File(modelFileName));

        Trial trial = importer.testTrainSplit(instances);
        System.out.println("#of labels: " + classifier.getLabelAlphabet().size());
        for (int idx = 0; idx < classifier.getLabelAlphabet().size(); idx++) {
            Label label = classifier.getLabelAlphabet().lookupLabel(idx);
            System.out.println(label.toString() + ": " + trial.getF1(idx));
        }
    }
}
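The main method above only trains and evaluates the classifier. To label a previously unseen document with the saved model, the raw text must pass through the same pipe that was used during import so that its tokens are mapped into the classifier's feature alphabet. The following is a minimal sketch; the class name TextClassifierDemo and the sample sentence are made up for illustration, and it assumes data/20newsgroups.classifier was produced by the main method above.

package cn.edu.bjut.chapter3;

import java.io.File;

import cc.mallet.classify.Classification;
import cc.mallet.classify.Classifier;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Labeling;

public class TextClassifierDemo {
    public static void main(String[] args) throws Exception {
        // Load the classifier serialized by TextClassifierByMallet.main()
        TextClassifierByMallet importer = new TextClassifierByMallet();
        Classifier classifier = importer.loadClassifier(new File("data/20newsgroups.classifier"));

        // A new, unlabeled document (made-up example text)
        String text = "NASA announced the launch window for the next shuttle mission.";

        // Push the raw string through the pipe stored inside the classifier,
        // so it is tokenized and mapped into the feature alphabet built at
        // training time. The target is null because the document is unlabeled.
        InstanceList unlabeled = new InstanceList(classifier.getInstancePipe());
        unlabeled.addThruPipe(new Instance(text, null, "new-doc", null));

        // Classify and print the labels from most to least probable
        Classification result = classifier.classify(unlabeled.get(0));
        Labeling labeling = result.getLabeling();
        System.out.println("Best label: " + labeling.getBestLabel());
        for (int rank = 0; rank < labeling.numLocations(); rank++) {
            System.out.println(labeling.getLabelAtRank(rank) + "\t" + labeling.getValueAtRank(rank));
        }
    }
}

Because the pipe is serialized together with the classifier, getInstancePipe() reuses exactly the feature and label alphabets that were built during training.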
If the following error message appears at runtime:
Couldn't open cc.mallet.util.MalletLogger resources/logging.properties file.
Perhaps the 'resources' directories weren't copied into the 'class' directory.
Continuing.
create the directory src/main/resources/cc/mallet/util/resources/ and copy the logging.properties file into that directory.
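With logging.properties in place, the example can be compiled and run from the project root with Maven. The exec-maven-plugin invocation below is just one convenient way to start the main class; the plugin is not declared in the pom.xml above, so Maven downloads it on first use:
> mvn compile
> mvn exec:java -Dexec.mainClass=cn.edu.bjut.chapter3.TextClassifierByMallet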