/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.util;

import cc.mallet.pipe.Noop;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CommandOption;
import cc.mallet.util.FeatureCountTool;
import cc.mallet.util.MalletLogger;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.BitSet;
import java.util.logging.Logger;

/**
 * Command-line tool that prunes the feature vocabulary of a serialized instance list.
 *
 * <p>The tool loads an {@link InstanceList}, counts features with {@link FeatureCountTool},
 * keeps only the features that satisfy the {@code prune-count}, {@code prune-document-freq},
 * {@code min-idf} and {@code max-idf} options, re-indexes the survivors into a fresh
 * {@link Alphabet}, and saves a new instance list built on that alphabet.
 *
 * <p>Instances whose data is a {@link FeatureSequence} or a {@link FeatureVector} are
 * converted; the type of the first instance decides which conversion is applied to the
 * whole list.
 */
public class VocabularyPruner {
    private static final Logger logger = MalletLogger.getLogger(VocabularyPruner.class.getName());
    static CommandOption.File inputFile = new CommandOption.File(VocabularyPruner.class, "input", "FILE", true, new File("-"), "Read the instance list from this file; Using - indicates stdin.", null);
    static CommandOption.File outputFile = new CommandOption.File(VocabularyPruner.class, "output", "FILE", true, new File("-"), "Write pruned instance list to this file. Using - indicates stdout.", null);
    static CommandOption.Integer pruneCount = new CommandOption.Integer(VocabularyPruner.class, "prune-count", "N", false, 0, "Reduce features to those that occur more than N times.", null);
    static CommandOption.Integer pruneDocFreq = new CommandOption.Integer(VocabularyPruner.class, "prune-document-freq", "N", false, 0, "Reduce features to those that occur in more than N contexts.", null);
    static CommandOption.Double minIDF = new CommandOption.Double(VocabularyPruner.class, "min-idf", "NUMBER", false, 0.0, "Remove (common) features with inverse document frequency less than this value (3.0 = 5% of docs).", null);
    static CommandOption.Double maxIDF = new CommandOption.Double(VocabularyPruner.class, "max-idf", "NUMBER", false, Double.POSITIVE_INFINITY, "Remove (rare) features with inverse document frequency greater than this value (10 = one in 25k docs).", null);

    /**
     * Entry point: parses the command-line options, prunes the vocabulary of the input
     * instance list and writes the pruned list to the output file.
     *
     * <p>When called with no arguments the usage summary is printed and the JVM exits
     * with status {@code -1}.
     *
     * @param args command-line arguments, handed to {@link CommandOption#process}
     * @throws FileNotFoundException declared by the original signature
     * @throws IOException declared by the original signature
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        CommandOption.setSummary(VocabularyPruner.class, "A tool for pruning features from instance lists.");
        CommandOption.process(VocabularyPruner.class, args);
        if (args.length == 0) {
            CommandOption.getList(VocabularyPruner.class).printUsage(false);
            System.exit(-1);
        }

        InstanceList instances = InstanceList.load(VocabularyPruner.inputFile.value);
        FeatureCountTool counter = new FeatureCountTool(instances);
        counter.count();
        double[] tokenCounts = counter.getFeatureCounts();
        int[] documentCounts = counter.getDocumentFrequencies();

        int minTokens = pruneCount.value();
        int minDocFreq = pruneDocFreq.value();
        double numDocs = (double) instances.size();
        // With IDF = ln(numDocs / docFreq), "IDF > max-idf" means docFreq < numDocs * exp(-max-idf)
        // and "IDF < min-idf" means docFreq > numDocs * exp(-min-idf).
        double rareCutoff = Math.exp(-VocabularyPruner.maxIDF.value) * numDocs;
        double commonCutoff = Math.min(numDocs, Math.exp(-VocabularyPruner.minIDF.value) * numDocs);

        Alphabet oldAlphabet = instances.getDataAlphabet();
        Alphabet newAlphabet = new Alphabet();
        // newIDs maps an old feature index to its index in newAlphabet, or -1 if pruned.
        int[] newIDs = new int[oldAlphabet.size()];
        // bs marks the surviving old feature indices; used for the FeatureVector conversion.
        BitSet bs = new BitSet(oldAlphabet.size());
        for (int feature = 0; feature < newIDs.length; ++feature) {
            if (keepFeature(tokenCounts[feature], documentCounts[feature], minTokens, minDocFreq, rareCutoff, commonCutoff)) {
                newIDs[feature] = newAlphabet.lookupIndex(oldAlphabet.lookupObject(feature));
                bs.set(feature);
            } else {
                newIDs[feature] = -1;
            }
        }
        logger.info("features: " + oldAlphabet.size() + " -> " + newAlphabet.size());

        Noop newPipe = new Noop(newAlphabet, instances.getTargetAlphabet());
        InstanceList newInstanceList = new InstanceList(newPipe);
        if (instances.size() == 0) {
            // Nothing to convert; avoid indexing into an empty list below.
            logger.warning("Input instance list is empty; writing an empty instance list.");
        } else {
            Object firstData = ((Instance) instances.get(0)).getData();
            if (firstData instanceof FeatureSequence) {
                pruneFeatureSequences(instances, newInstanceList, newPipe, newAlphabet, newIDs);
            } else if (firstData instanceof FeatureVector) {
                pruneFeatureVectors(instances, newInstanceList, oldAlphabet, newAlphabet, bs);
            } else {
                logger.warning("Instance data is neither a FeatureSequence nor a FeatureVector; no instances were converted.");
            }
        }

        logger.info("Writing instance list to " + VocabularyPruner.outputFile.value);
        newInstanceList.save(VocabularyPruner.outputFile.value);
    }

    /**
     * Decides whether a feature survives pruning.
     *
     * <p>The token-count and document-frequency thresholds are exclusive ("more than N"),
     * matching their option descriptions. The IDF-derived cutoffs are inclusive: the
     * options remove features whose IDF is strictly less than {@code min-idf} or strictly
     * greater than {@code max-idf}, so a feature sitting exactly on a bound is kept. In
     * particular, with the default {@code min-idf} of 0 a feature that occurs in every
     * document is retained.
     *
     * @param tokenCount total occurrence count of the feature
     * @param documentCount number of instances containing the feature
     * @param minTokens value of {@code prune-count}
     * @param minDocFreq value of {@code prune-document-freq}
     * @param rareCutoff smallest document count allowed by {@code max-idf}
     * @param commonCutoff largest document count allowed by {@code min-idf}
     * @return {@code true} if the feature should be kept
     */
    private static boolean keepFeature(double tokenCount, int documentCount, int minTokens, int minDocFreq, double rareCutoff, double commonCutoff) {
        return tokenCount > (double) minTokens
                && documentCount > minDocFreq
                && (double) documentCount >= rareCutoff
                && (double) documentCount <= commonCutoff;
    }

    /**
     * Converts every {@link FeatureSequence} instance of {@code instances} to the new
     * alphabet and appends it to {@code newInstanceList}, carrying over the instance weight.
     * Each source instance is removed from {@code instances} once converted, so the source
     * list is empty when this method returns.
     */
    private static void pruneFeatureSequences(InstanceList instances, InstanceList newInstanceList, Noop newPipe, Alphabet newAlphabet, int[] newIDs) {
        while (instances.size() > 0) {
            Instance instance = (Instance) instances.get(0);
            // Read the weight before the instance is removed from the source list.
            double weight = instances.getInstanceWeight(0);
            FeatureSequence pruned = remapSequence((FeatureSequence) instance.getData(), newIDs, newAlphabet);
            newInstanceList.add(newPipe.instanceFrom(new Instance(pruned, instance.getTarget(), instance.getName(), instance.getSource())), weight);
            // NOTE(review): presumably removed so the old data can be reclaimed early - confirm.
            instances.remove(0);
        }
    }

    /**
     * Builds a new {@link FeatureSequence} over {@code newAlphabet} that contains, in the
     * original order, the re-indexed features of {@code original} that were not pruned.
     */
    private static FeatureSequence remapSequence(FeatureSequence original, int[] newIDs, Alphabet newAlphabet) {
        int[] originalFeatures = original.getFeatures();
        int length = original.getLength();
        int[] buffer = new int[length];
        int kept = 0;
        for (int i = 0; i < length; ++i) {
            int newID = newIDs[originalFeatures[i]];
            if (newID != -1) {
                buffer[kept++] = newID;
            }
        }
        // Hand over an exactly-sized array, as the original two-pass code did.
        int[] newFeatures = new int[kept];
        System.arraycopy(buffer, 0, newFeatures, 0, kept);
        return new FeatureSequence(newAlphabet, newFeatures);
    }

    /**
     * Converts every {@link FeatureVector} instance of {@code instances} to the new
     * alphabet, restricted to the features selected in {@code keptFeatures}, and appends it
     * to {@code newInstanceList} with its original weight. The data of each source instance
     * is set to {@code null} after conversion.
     */
    private static void pruneFeatureVectors(InstanceList instances, InstanceList newInstanceList, Alphabet oldAlphabet, Alphabet newAlphabet, BitSet keptFeatures) {
        FeatureSelection selection = new FeatureSelection(oldAlphabet, keptFeatures);
        for (int ii = 0; ii < instances.size(); ++ii) {
            Instance instance = (Instance) instances.get(ii);
            FeatureVector fv = (FeatureVector) instance.getData();
            FeatureVector fv2 = FeatureVector.newFeatureVector(fv, newAlphabet, selection);
            newInstanceList.add(new Instance(fv2, instance.getTarget(), instance.getName(), instance.getSource()), instances.getInstanceWeight(ii));
            // Unlock first so the data field can be cleared.
            instance.unLock();
            instance.setData(null);
        }
    }
}

