/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer;

import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.google.common.io.Closeables;
import java.io.Closeable;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.RandomDocumentGenerator;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
import org.junit.Before;
import org.junit.Test;

/**
 * Exercises {@link DictionaryVectorizer} and {@link TFIDFConverter} end to end on a small
 * generated corpus.
 *
 * <p>Each test tokenizes the corpus written by {@link #setUp()}, builds term-frequency vectors,
 * converts them to TF-IDF vectors, and then checks the vector count, the concrete vector classes
 * and the presence of the {@code 2NDBLOCK} marker token in both the dictionary and every
 * term-frequency vector.
 */
@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
public final class DictionaryVectorizerTest
extends MahoutTestCase {
    /** Number of distinct document ids written by {@link #setUp()} and expected back as vectors. */
    private static final int NUM_DOCS = 100;
    /** Marker text appended as a second entry under every document id. */
    private static final String SECOND_TEXT_BLOCK_IDENTIFIER = "2NDBLOCK";
    /** Name of the term-frequency output folder created below the word-count directory. */
    private static final String TF_VECTORS_FOLDER_NAME = "tf-vectors";
    /** Sequence file of (document id, text) pairs that every test reads from. */
    private Path inputPath;

    /**
     * Writes the input sequence file used by all tests.
     *
     * <p>For each of the {@link #NUM_DOCS} document ids two entries are appended under the same
     * key: one with a randomly generated document and one containing only
     * {@link #SECOND_TEXT_BLOCK_IDENTIFIER}.
     *
     * @throws Exception if the superclass set-up or writing the sequence file fails
     */
    @Override
    @Before
    public void setUp() throws Exception {
        super.setUp();
        Configuration conf = getConfiguration();
        inputPath = getTestTempFilePath("documents/docs.file");
        FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, Text.class, Text.class);
        try {
            RandomDocumentGenerator gen = new RandomDocumentGenerator();
            for (int i = 0; i < NUM_DOCS; i++) {
                String documentId = "Document::ID::" + i;
                writer.append(new Text(documentId), new Text(gen.getRandomDocument()));
                writer.append(new Text(documentId), new Text(SECOND_TEXT_BLOCK_IDENTIFIER));
            }
        } finally {
            // 'false' means an IOException raised while closing is propagated, not swallowed.
            Closeables.close(writer, false);
        }
    }

    /** Random-access, unnamed vectors. */
    @Test
    public void testCreateTermFrequencyVectors() throws Exception {
        runTest(false, false);
    }

    /** Random-access, named vectors. */
    @Test
    public void testCreateTermFrequencyVectorsNam() throws Exception {
        runTest(false, true);
    }

    /** Sequential-access, unnamed vectors. */
    @Test
    public void testCreateTermFrequencyVectorsSeq() throws Exception {
        runTest(true, false);
    }

    /** Sequential-access, named vectors. */
    @Test
    public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
        runTest(true, true);
    }

    /**
     * Runs the tokenize / term-frequency / TF-IDF pipeline and validates its outputs.
     *
     * @param sequential whether the produced vectors are expected to be
     *                   {@link SequentialAccessSparseVector}s rather than
     *                   {@link RandomAccessSparseVector}s
     * @param named      whether the produced vectors are expected to be wrapped in
     *                   {@link NamedVector}s
     */
    private void runTest(boolean sequential, boolean named) throws IOException, ClassNotFoundException, InterruptedException {
        Class<StandardAnalyzer> analyzer = StandardAnalyzer.class;
        Path tokenizedDocuments = getTestTempDirPath("output/tokenized-documents");
        Path wordCount = getTestTempDirPath("output/wordcount");
        Path tfVectors = new Path(wordCount, TF_VECTORS_FOLDER_NAME);
        Path tfidf = getTestTempDirPath("output/tfidf");
        Path tfidfVectors = new Path(tfidf, "tfidf-vectors");
        Configuration conf = getConfiguration();

        DocumentProcessor.tokenizeDocuments(inputPath, analyzer, tokenizedDocuments, conf);

        // The numeric arguments are positional tuning parameters of the vectorizer. The literal
        // 100 below is kept as a literal on purpose: nothing here shows that it represents the
        // document count, so it must not be tied to NUM_DOCS.
        // NOTE(review): presumably a chunk size - confirm against DictionaryVectorizer.
        DictionaryVectorizer.createTermFrequencyVectors(tokenizedDocuments, wordCount, TF_VECTORS_FOLDER_NAME, conf,
                2, 1, 0.0f, -1.0f, true, 1, 100, sequential, named);
        validateVectors(conf, NUM_DOCS, tfVectors, sequential, named);

        // Same remark as above for the literal 100 passed to calculateDF.
        Pair<Long[], List<Path>> docFrequenciesFeatures = TFIDFConverter.calculateDF(tfVectors, tfidf, conf, 100);
        TFIDFConverter.processTfIdf(tfVectors, tfidf, conf, docFrequenciesFeatures,
                1, -1L, 2.0f, false, sequential, named, 1);
        validateVectors(conf, NUM_DOCS, tfidfVectors, sequential, named);

        Integer secondTextBlockIdentifierDimensionId = validateDictionary(wordCount, conf);
        validateVectorContainingSecondTextBlock(conf, tfVectors, secondTextBlockIdentifierDimensionId);
    }

    /**
     * Asserts that the part files under {@code vectorPath} contain exactly {@code numDocs}
     * vectors of the expected concrete type.
     *
     * @param conf       Hadoop configuration used to read the sequence files
     * @param numDocs    expected number of vectors
     * @param vectorPath directory holding the vector part files
     * @param sequential expect {@link SequentialAccessSparseVector} if {@code true}, otherwise
     *                   {@link RandomAccessSparseVector}
     * @param named      expect each vector to be a {@link NamedVector}; the type check is then
     *                   applied to its delegate
     */
    public static void validateVectors(Configuration conf, int numDocs, Path vectorPath, boolean sequential, boolean named) {
        int count = 0;
        for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(
                vectorPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
            count++;
            Vector v = value.get();
            if (named) {
                assertTrue("Expected NamedVector", v instanceof NamedVector);
                v = ((NamedVector) v).getDelegate();
            }
            if (sequential) {
                assertTrue("Expected SequentialAccessSparseVector", v instanceof SequentialAccessSparseVector);
            } else {
                assertTrue("Expected RandomAccessSparseVector", v instanceof RandomAccessSparseVector);
            }
        }
        assertEquals("Expected " + numDocs + " documents", numDocs, count);
    }

    /**
     * Loads the dictionary written below {@code dictionaryDirectoryPath} and checks that it
     * contains the lower-cased {@link #SECOND_TEXT_BLOCK_IDENTIFIER} plus at least one other term.
     *
     * @param dictionaryDirectoryPath directory containing files whose names start with
     *                                {@code dictionary.file}
     * @param conf                    Hadoop configuration used to read the sequence files
     * @return the dimension id the dictionary assigns to the marker token
     */
    private Integer validateDictionary(Path dictionaryDirectoryPath, Configuration conf) {
        PathFilter dictionaryChunkPathFilter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("dictionary.file");
            }
        };

        Map<String, Integer> dictionary = new HashMap<String, Integer>();
        for (Pair<Text, IntWritable> entry : new SequenceFileDirIterable<Text, IntWritable>(
                dictionaryDirectoryPath, PathType.LIST, dictionaryChunkPathFilter, null, true, conf)) {
            dictionary.put(entry.getFirst().toString(), entry.getSecond().get());
        }

        // Locale.ROOT keeps the lookup key independent of the JVM's default locale.
        // NOTE(review): assumes the analyzer lower-cases tokens before they reach the dictionary.
        Integer secondTextBlockIdentifierDimensionId =
                dictionary.get(SECOND_TEXT_BLOCK_IDENTIFIER.toLowerCase(Locale.ROOT));
        assertNotNull("Token '2NDBLOCK' must be in dictionary ", secondTextBlockIdentifierDimensionId);
        assertTrue("Dictionary must contain more than just 1 element!", dictionary.size() > 1);
        return secondTextBlockIdentifierDimensionId;
    }

    /**
     * Asserts that every vector under {@code vectorPath} has a strictly positive value at
     * {@code dimensionId}.
     *
     * @param conf        Hadoop configuration used to read the sequence files
     * @param vectorPath  directory holding the vector part files
     * @param dimensionId dictionary id of the marker token
     */
    public static void validateVectorContainingSecondTextBlock(Configuration conf, Path vectorPath, int dimensionId) {
        for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(
                vectorPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
            assertTrue("The vector must contain the second text block", value.get().get(dimensionId) > 0.0);
        }
    }
}

