public class Normalization extends Object
| Constructor and Description |
| --- |
| `Normalization()` |
| Modifier and Type | Method and Description |
| --- | --- |
| `static List<org.apache.spark.sql.Row>` | `aggregate(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, String[] columns, String[] functions)` Aggregate based on an arbitrary list of aggregation and grouping functions. |
| `static List<org.apache.spark.sql.Row>` | `minMaxColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, List<String> columns)` Returns the min and max of the given columns. |
| `static List<org.apache.spark.sql.Row>` | `minMaxColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, String... columns)` Returns the min and max of the given columns. |
| `static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>` | `normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame)` Scale based on min/max. |
| `static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>` | `normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame, double min, double max)` Scale based on min/max. |
| `static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>` | `normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame, double min, double max, List<String> skipColumns)` Scale based on min/max. |
| `static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>` | `normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame, List<String> skipColumns)` Scale based on min/max. |
| `static org.apache.spark.api.java.JavaRDD<List<Writable>>` | `normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data)` Scale all data to the range 0 to 1. |
| `static org.apache.spark.api.java.JavaRDD<List<Writable>>` | `normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, double min, double max)` Scale based on min/max. |
| `static org.apache.spark.api.java.JavaRDD<List<Writable>>` | `normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, double min, double max, List<String> skipColumns)` Scale based on min/max. |
| `static org.apache.spark.api.java.JavaRDD<List<Writable>>` | `normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, List<String> skipColumns)` Scale all data to the range 0 to 1. |
| `static org.apache.spark.api.java.JavaRDD<List<List<Writable>>>` | `normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> data)` |
| `static org.apache.spark.api.java.JavaRDD<List<List<Writable>>>` | `normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> data, double min, double max)` Normalize each column of a sequence, based on min/max. |
| `static org.apache.spark.api.java.JavaRDD<List<List<Writable>>>` | `normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> data, double min, double max, List<String> excludeColumns)` Normalize each column of a sequence, based on min/max. |
| `static List<org.apache.spark.sql.Row>` | `stdDevMeanColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, List<String> columns)` Returns the standard deviation and mean of the given columns. |
| `static List<org.apache.spark.sql.Row>` | `stdDevMeanColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, String... columns)` Returns the standard deviation and mean of the given columns. The returned list has size 2: one row with the standard deviation of each column, and one row with the mean of each column. |
| `static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>` | `zeromeanUnitVariance(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> frame)` Normalize by zero mean and unit variance. |
| `static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>` | `zeromeanUnitVariance(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> frame, List<String> skipColumns)` Normalize by zero mean and unit variance. |
| `static org.apache.spark.api.java.JavaRDD<List<Writable>>` | `zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data)` Normalize by zero mean and unit variance. |
| `static org.apache.spark.api.java.JavaRDD<List<Writable>>` | `zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, List<String> skipColumns)` Normalize by zero mean and unit variance. |
| `static org.apache.spark.api.java.JavaRDD<List<List<Writable>>>` | `zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> sequence)` Normalize the sequence by zero mean and unit variance. |
| `static org.apache.spark.api.java.JavaRDD<List<List<Writable>>>` | `zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> sequence, List<String> excludeColumns)` Normalize the sequence by zero mean and unit variance. |
`public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> zeromeanUnitVariance(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> frame)`

Normalize by zero mean and unit variance.

`frame` - the data to normalize
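As a usage illustration (not part of the original Javadoc), here is a minimal sketch of standardizing a small DataFrame. It assumes the class lives at `org.datavec.spark.transform.Normalization`, as in DataVec's Spark module, and that every column of the input frame is numeric; the class name, column names, and values are invented for the example:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.datavec.spark.transform.Normalization; // assumed package path

public class ZeroMeanUnitVarianceExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("zmuv-example").getOrCreate();

        // Two numeric columns with very different scales
        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("x", DataTypes.DoubleType, false),
                DataTypes.createStructField("y", DataTypes.DoubleType, false)});
        List<Row> rows = Arrays.asList(
                RowFactory.create(1.0, 100.0),
                RowFactory.create(2.0, 200.0),
                RowFactory.create(3.0, 300.0));
        Dataset<Row> frame = spark.createDataFrame(rows, schema);

        // Each column is rescaled to (value - mean) / stdev
        Dataset<Row> standardized = Normalization.zeromeanUnitVariance(frame);
        standardized.show();

        spark.stop();
    }
}
```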
`public static org.apache.spark.api.java.JavaRDD<List<Writable>> zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data)`

Normalize by zero mean and unit variance.

`schema` - the schema to use to create the data frame
`data` - the data to normalize
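For the RDD-of-writables overload, a sketch under the same assumption about the `Normalization` package, with `Schema` and `DoubleWritable` taken from DataVec's core API (`org.datavec.api.transform.schema` and `org.datavec.api.writable`):

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization; // assumed package path

public class ZeroMeanUnitVarianceRddExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("zmuv-rdd"));

        // DataVec schema describing the two numeric columns
        Schema schema = new Schema.Builder()
                .addColumnDouble("x")
                .addColumnDouble("y")
                .build();

        // Each record is one row of writables matching the schema
        JavaRDD<List<Writable>> data = sc.parallelize(Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(1.0), new DoubleWritable(100.0)),
                Arrays.<Writable>asList(new DoubleWritable(2.0), new DoubleWritable(200.0)),
                Arrays.<Writable>asList(new DoubleWritable(3.0), new DoubleWritable(300.0))));

        JavaRDD<List<Writable>> standardized = Normalization.zeromeanUnitVariance(schema, data);
        standardized.collect().forEach(System.out::println);

        sc.stop();
    }
}
```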
`public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame, double min, double max)`

Scale based on min/max.

`dataFrame` - the dataframe to scale
`min` - the minimum value
`max` - the maximum value
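A short sketch of rescaling into a caller-chosen range, here [-1, 1], under the same package-path assumption as the earlier examples:

```java
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.datavec.spark.transform.Normalization; // assumed package path

public class MinMaxScaleExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("minmax-example").getOrCreate();

        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("feature", DataTypes.DoubleType, false)});
        Dataset<Row> frame = spark.createDataFrame(Arrays.asList(
                RowFactory.create(10.0), RowFactory.create(20.0), RowFactory.create(30.0)),
                schema);

        // The column minimum maps to -1.0 and the column maximum to 1.0
        Dataset<Row> scaled = Normalization.normalize(frame, -1.0, 1.0);
        scaled.show();

        spark.stop();
    }
}
```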
`public static org.apache.spark.api.java.JavaRDD<List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, double min, double max)`

Scale based on min/max.

`schema` - the schema of the data to scale
`data` - the data to scale
`min` - the minimum value
`max` - the maximum value
`public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame)`

Scale based on min/max.

`dataFrame` - the dataframe to scale
`public static org.apache.spark.api.java.JavaRDD<List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data)`

Scale all data to the range 0 to 1.

`schema` - the schema of the data to scale
`data` - the data to scale
`public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> zeromeanUnitVariance(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> frame, List<String> skipColumns)`

Normalize by zero mean and unit variance.

`frame` - the data to normalize
`skipColumns` - the columns to skip
`public static org.apache.spark.api.java.JavaRDD<List<Writable>> zeromeanUnitVariance(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, List<String> skipColumns)`

Normalize by zero mean and unit variance.

`schema` - the schema to use to create the data frame
`data` - the data to normalize
`skipColumns` - the columns to skip
`public static org.apache.spark.api.java.JavaRDD<List<List<Writable>>> zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> sequence)`

Normalize the sequence by zero mean and unit variance.

`schema` - Schema of the data to normalize
`sequence` - Sequence data
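A sketch of the sequence variant under the same package-path assumption. Each RDD element is assumed to be one sequence: a list of time steps, each time step a list of writables matching the schema, with per-column statistics taken across all sequences and steps:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization; // assumed package path

public class SequenceStandardizeExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("seq-zmuv"));

        Schema schema = new Schema.Builder().addColumnDouble("sensor").build();

        // One sequence = a list of time steps; each step holds one writable per column
        List<List<Writable>> sequence = Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(1.0)),
                Arrays.<Writable>asList(new DoubleWritable(2.0)),
                Arrays.<Writable>asList(new DoubleWritable(3.0)));
        JavaRDD<List<List<Writable>>> data = sc.parallelize(Arrays.asList(sequence));

        JavaRDD<List<List<Writable>>> standardized =
                Normalization.zeroMeanUnitVarianceSequence(schema, data);
        standardized.collect().forEach(System.out::println);

        sc.stop();
    }
}
```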
`public static org.apache.spark.api.java.JavaRDD<List<List<Writable>>> zeroMeanUnitVarianceSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> sequence, List<String> excludeColumns)`

Normalize the sequence by zero mean and unit variance.

`schema` - Schema of the data to normalize
`sequence` - Sequence data
`excludeColumns` - List of columns to exclude from the normalization
`public static List<org.apache.spark.sql.Row> minMaxColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, List<String> columns)`

Returns the min and max of the given columns.

`data` - the data to get the min and max for
`columns` - the columns to get the min and max for
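A sketch of querying the per-column statistics, with the same package-path assumption; note that which returned row holds the minima and which the maxima is assumed from the method name, not stated by the docs:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.datavec.spark.transform.Normalization; // assumed package path

public class MinMaxColumnsExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("minmax-columns").getOrCreate();

        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("x", DataTypes.DoubleType, false),
                DataTypes.createStructField("y", DataTypes.DoubleType, false)});
        Dataset<Row> data = spark.createDataFrame(Arrays.asList(
                RowFactory.create(1.0, 5.0),
                RowFactory.create(4.0, 2.0)), schema);

        // One result row per statistic; the min/max row ordering is assumed
        List<Row> minMax = Normalization.minMaxColumns(data, Arrays.asList("x", "y"));
        for (Row row : minMax) {
            System.out.println(row);
        }

        spark.stop();
    }
}
```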
`public static List<org.apache.spark.sql.Row> minMaxColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, String... columns)`

Returns the min and max of the given columns.

`data` - the data to get the min and max for
`columns` - the columns to get the min and max for
`public static List<org.apache.spark.sql.Row> stdDevMeanColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, List<String> columns)`

Returns the standard deviation and mean of the given columns.

`data` - the data to get the standard deviation and mean for
`columns` - the columns to get the standard deviation and mean for
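A sketch keyed to the size-2 contract described below for the varargs overload; the same package-path assumption applies, and the stddev-then-mean row ordering is assumed from the method name:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.datavec.spark.transform.Normalization; // assumed package path

public class StdDevMeanColumnsExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("stddev-mean").getOrCreate();

        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("x", DataTypes.DoubleType, false)});
        Dataset<Row> data = spark.createDataFrame(Arrays.asList(
                RowFactory.create(1.0), RowFactory.create(2.0), RowFactory.create(3.0)),
                schema);

        // Size-2 result: one row of standard deviations, one row of means
        // (the row ordering is assumed, not documented)
        List<Row> stats = Normalization.stdDevMeanColumns(data, "x");
        System.out.println(stats.get(0) + " / " + stats.get(1));

        spark.stop();
    }
}
```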
`public static List<org.apache.spark.sql.Row> stdDevMeanColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, String... columns)`

Returns the standard deviation and mean of the given columns. The returned list has size 2: one row with the standard deviation of each column, and one row with the mean of each column.

`data` - the data to get the standard deviation and mean for
`columns` - the columns to get the standard deviation and mean for
`public static List<org.apache.spark.sql.Row> aggregate(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> data, String[] columns, String[] functions)`

Aggregate based on an arbitrary list of aggregation and grouping functions.

`data` - the dataframe to aggregate
`columns` - the columns to aggregate
`functions` - the functions to use
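A hedged sketch of this helper, with the usual package-path assumption. How the two arrays pair up is not spelled out; the sketch assumes one function per column, positionally, and that the strings are Spark SQL aggregate function names such as "min" and "max":

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.datavec.spark.transform.Normalization; // assumed package path

public class AggregateExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("aggregate-example").getOrCreate();

        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("x", DataTypes.DoubleType, false),
                DataTypes.createStructField("y", DataTypes.DoubleType, false)});
        Dataset<Row> data = spark.createDataFrame(Arrays.asList(
                RowFactory.create(1.0, 10.0),
                RowFactory.create(2.0, 20.0)), schema);

        // Assumed: functions[i] is applied to columns[i]
        List<Row> result = Normalization.aggregate(data,
                new String[]{"x", "y"},
                new String[]{"min", "max"});
        result.forEach(System.out::println);

        spark.stop();
    }
}
```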
`public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame, double min, double max, List<String> skipColumns)`

Scale based on min/max.

`dataFrame` - the dataframe to scale
`min` - the minimum value
`max` - the maximum value
`skipColumns` - the columns to skip
`public static org.apache.spark.api.java.JavaRDD<List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, double min, double max, List<String> skipColumns)`

Scale based on min/max.

`schema` - the schema of the data to scale
`data` - the data to scale
`min` - the minimum value
`max` - the maximum value
`skipColumns` - the columns to skip
`public static org.apache.spark.api.java.JavaRDD<List<List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> data)`

`schema` - Schema of the data
`data` - Data to normalize
`public static org.apache.spark.api.java.JavaRDD<List<List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> data, double min, double max)`

Normalize each column of a sequence, based on min/max.

`schema` - Schema of the data
`data` - Data to normalize
`min` - New minimum value
`max` - New maximum value
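A sketch of min/max rescaling over sequence data, under the same assumptions about the `Normalization` package path and the sequence layout (list of time steps, each a list of writables):

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.Normalization; // assumed package path

public class SequenceMinMaxExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("seq-minmax"));

        Schema schema = new Schema.Builder().addColumnDouble("sensor").build();

        List<List<Writable>> sequence = Arrays.asList(
                Arrays.<Writable>asList(new DoubleWritable(0.0)),
                Arrays.<Writable>asList(new DoubleWritable(50.0)),
                Arrays.<Writable>asList(new DoubleWritable(100.0)));
        JavaRDD<List<List<Writable>>> data = sc.parallelize(Arrays.asList(sequence));

        // Each column's observed minimum maps to -1 and its maximum to 1
        JavaRDD<List<List<Writable>>> rescaled =
                Normalization.normalizeSequence(schema, data, -1.0, 1.0);
        rescaled.collect().forEach(System.out::println);

        sc.stop();
    }
}
```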
`public static org.apache.spark.api.java.JavaRDD<List<List<Writable>>> normalizeSequence(Schema schema, org.apache.spark.api.java.JavaRDD<List<List<Writable>>> data, double min, double max, List<String> excludeColumns)`

Normalize each column of a sequence, based on min/max.

`schema` - Schema of the data
`data` - Data to normalize
`min` - New minimum value
`max` - New maximum value
`excludeColumns` - List of columns to exclude
`public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> normalize(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> dataFrame, List<String> skipColumns)`

Scale based on min/max.

`dataFrame` - the dataframe to scale
`skipColumns` - the columns to skip
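A sketch of excluding a column (for example, a label) from scaling, with the same package-path assumption; the no-range overload is assumed to scale the remaining columns to the default 0-to-1 range, matching the RDD variants documented above:

```java
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.datavec.spark.transform.Normalization; // assumed package path

public class SkipColumnsExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]").appName("skip-columns").getOrCreate();

        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("feature", DataTypes.DoubleType, false),
                DataTypes.createStructField("label", DataTypes.DoubleType, false)});
        Dataset<Row> frame = spark.createDataFrame(Arrays.asList(
                RowFactory.create(10.0, 0.0),
                RowFactory.create(20.0, 1.0)), schema);

        // "feature" is min/max scaled; "label" is left untouched
        Dataset<Row> scaled = Normalization.normalize(frame, Arrays.asList("label"));
        scaled.show();

        spark.stop();
    }
}
```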
`public static org.apache.spark.api.java.JavaRDD<List<Writable>> normalize(Schema schema, org.apache.spark.api.java.JavaRDD<List<Writable>> data, List<String> skipColumns)`

Scale all data to the range 0 to 1.

`schema` - the schema of the data to scale
`data` - the data to scale
`skipColumns` - the columns to skip