public class TextPipeline extends Object
Constructor and Description |
---|
TextPipeline() |
TextPipeline(org.apache.spark.api.java.JavaRDD<String> corpusRDD, org.apache.spark.broadcast.Broadcast<Map<String,Object>> broadcasTokenizerVarMap) |
Modifier and Type | Method and Description |
---|---|
void | buildVocabCache() |
void | buildVocabWordListRDD() |
void | filterMinWordAddVocab(Counter<String> wordFreq) |
org.apache.spark.broadcast.Broadcast<VocabCache<VocabWord>> | getBroadCastVocabCache() |
org.apache.spark.api.java.JavaRDD<AtomicLong> | getSentenceCountRDD() |
org.apache.spark.api.java.JavaRDD<Pair<List<String>,AtomicLong>> | getSentenceWordsCountRDD() |
Long | getTotalWordCount() |
VocabCache<VocabWord> | getVocabCache() |
org.apache.spark.api.java.JavaRDD<List<VocabWord>> | getVocabWordListRDD() |
org.apache.spark.Accumulator<Counter<String>> | getWordFreqAcc() |
void | setRDDVarMap(org.apache.spark.api.java.JavaRDD<String> corpusRDD, org.apache.spark.broadcast.Broadcast<Map<String,Object>> broadcasTokenizerVarMap) |
org.apache.spark.api.java.JavaRDD<List<String>> | tokenize() |
org.apache.spark.api.java.JavaRDD<Pair<List<String>,AtomicLong>> | updateAndReturnAccumulatorVal(org.apache.spark.api.java.JavaRDD<List<String>> tokenizedRDD) |
public void setRDDVarMap(org.apache.spark.api.java.JavaRDD<String> corpusRDD, org.apache.spark.broadcast.Broadcast<Map<String,Object>> broadcasTokenizerVarMap)
public org.apache.spark.api.java.JavaRDD<Pair<List<String>,AtomicLong>> updateAndReturnAccumulatorVal(org.apache.spark.api.java.JavaRDD<List<String>> tokenizedRDD)
public void buildVocabCache()
public void buildVocabWordListRDD()
public org.apache.spark.broadcast.Broadcast<VocabCache<VocabWord>> getBroadCastVocabCache() throws IllegalStateException
Throws: IllegalStateException
public VocabCache<VocabWord> getVocabCache() throws IllegalStateException
Throws: IllegalStateException
public org.apache.spark.api.java.JavaRDD<Pair<List<String>,AtomicLong>> getSentenceWordsCountRDD()
public org.apache.spark.api.java.JavaRDD<List<VocabWord>> getVocabWordListRDD() throws IllegalStateException
Throws: IllegalStateException
public org.apache.spark.api.java.JavaRDD<AtomicLong> getSentenceCountRDD() throws IllegalStateException
Throws: IllegalStateException
public Long getTotalWordCount()
Copyright © 2020. All rights reserved.