analyzerConfig
Lucene Analyzer Schema
|
LuceneTextAnalyzer schema for tokenization (JSON-encoded)
type: string
default value: '{ "analyzers": [{ "name": "StdTokLowerStop","charFilters": [ { "type": "htmlstrip" } ],"tokenizer": { "type": "standard" },"filters": [{ "type": "lowercase" },{ "type": "KStem" },{ "type": "length", "min": "2", "max": "32767" },{ "type": "fusionstop", "ignoreCase": "true", "format": "snowball", "words": "org/apache/lucene/analysis/snowball/english_stop.txt" }] }],"fields": [{ "regex": ".+", "analyzer": "StdTokLowerStop" } ]} '
|
autoBalanceClasses
Auto-balance training classes
|
Ensure that all classes of training data have the same size
type: boolean
default value: 'true '
|
dataFormat
Data format
|
Spark-compatible format that the training data comes in (e.g. 'solr', 'hdfs', 'file', 'parquet')
type: string
default value: 'solr '
enum: {
solr
hdfs
file
parquet
}
|
elasticNetWeight
Elastic net weight
|
Value between 0 and 1 to interpolate between ridge (0.0) and lasso (1.0) regression
type: number
default value: '0.0 '
exclusiveMaximum: false
maximum: 1.0
|
evaluationMetricType
Evaluation Metric Type
|
Optimize hyperparameter search over one of [binary, multiclass, regression] metrics, or 'none'
type: string
default value: 'none '
enum: {
binary
multiclass
regression
none
}
|
fieldToVectorize
Field to Vectorize
required
|
Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.
type: string
minLength: 1
|
gridSearch
Grid Search with Cross Validation
|
Perform grid search to optimize hyperparameters
type: boolean
default value: 'false '
|
id
Spark Job ID
required
|
The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, 0-9, dash (-) and underscore (_). Maximum length: 63 characters.
type: string
maxLength: 63
pattern: ^[A-Za-z0-9_\-]+$
|
makeOtherClass
Make 'Other' Class
|
Create a label class 'Other' which contains all examples not in a class large enough to train on
type: boolean
default value: 'true '
|
maxDF
Max Term Document Frequency
|
To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)
type: number
default value: '1.0 '
|
maxIters
Maximum number of iterations
|
Maximum number of iterations to perform before halting, even if the convergence criterion has not been met.
type: integer
default value: '10 '
|
minDF
Minimum Term Document Frequency
|
To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)
type: number
default value: '0.0 '
|
minSparkPartitions
Minimum Number of Spark Partitions
|
Minimum number of Spark partitions for training job.
type: integer
default value: '200 '
exclusiveMinimum: false
minimum: 1
|
minTrainingSamplesPerClass
Minimum Labeled Class Size
|
Ensure that all classes of training data have at least this many examples
type: integer
default value: '100 '
exclusiveMinimum: false
minimum: 1
|
modelId
Model ID
|
Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.
type: string
minLength: 1
|
norm
Vector normalization
|
p-norm to normalize vectors with (choose -1 to turn normalization off)
type: integer
default value: '2 '
enum: {
-1
0
1
2
}
|
otherClassName
'Other' class name
|
Label class name for the catch-all 'Other' class
type: string
default value: 'Other '
minLength: 1
|
outputCollection
Output Collection
|
Solr collection in which to store model-labeled data
type: string
|
overwriteExistingModel
Overwrite existing model
|
If a model exists in the model store, overwrite when this job runs
type: boolean
default value: 'true '
|
overwriteOutput
Overwrite Output
|
Overwrite output collection
type: boolean
default value: 'true '
|
predictedLabelField
Predicted Label Field
|
Solr field which will contain labels when classifier is applied to documents
type: string
default value: 'labelPredictedByFusionModel '
|
randomSeed
Random seed
|
Seed used for any deterministic pseudorandom number generation
type: integer
default value: '1234 '
|
regularizationWeight
Regularization weight
|
Degree of regularization to use when training (L2 lambda parameter if elasticNetWeight = 0)
type: number
default value: '0.01 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 1.0
minimum: 1.0e-06
|
serializeAsMleap
Serialize as Mleap Bundle
|
Serialize the output model as Mleap Bundle
type: boolean
default value: 'true '
|
sourceFields
Fields to Load
|
Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.
type: string
|
sparkConfig
Spark Settings
|
Spark configuration settings.
type: array of object
object attributes: {
key
(required)
: {
display name: Parameter Name
type: string
}
value
: {
display name: Parameter Value
type: string
}
}
|
stopwordsList
List of stopwords
|
Stopwords defined in Lucene analyzer config
type: array of string
|
trainingCollection
Training Collection
required
|
Solr Collection containing labeled training data
type: string
minLength: 1
|
trainingDataFilterQuery
Training data filter query
|
Solr query to use when loading training data
type: string
default value: '*:* '
minLength: 3
|
trainingDataFrameConfigOptions
Dataframe Config Options
|
Additional spark dataframe loading configuration options
type: object
object attributes: {
}
object attributes: {
}
|
trainingDataSamplingFraction
Training data sampling fraction
|
Fraction of the training data to use
type: number
default value: '1.0 '
exclusiveMaximum: false
maximum: 1.0
|
trainingLabelField
Label Field
required
|
Solr field containing labels for training instances (should be single-valued strings)
type: string
|
type
Spark Job Type
required
|
type: string
default value: 'logistic_regression_classifier_trainer '
enum: {
logistic_regression_classifier_trainer
}
|
w2vDimension
Word2Vec Dimension
|
Word-vector dimensionality used to represent text (set to a value > 0 to use Word2Vec; the default of 0 leaves it unused)
type: integer
default value: '0 '
exclusiveMinimum: false
minimum: 0
|
w2vMaxIter
Max Word2Vec Iterations
|
Maximum number of iterations of the word2vec training
type: integer
default value: '1 '
|
w2vMaxSentenceLength
Max Word2Vec Sentence Length
|
Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to this many words.
type: integer
default value: '1000 '
exclusiveMinimum: false
minimum: 3
|
w2vStepSize
Word2Vec Step Size
|
Training parameter for word2vec convergence (change at your own peril)
type: number
default value: '0.025 '
exclusiveMinimum: false
minimum: 0.005
|
w2vWindowSize
Word2Vec Window Size
|
The window size (context words from [-window, window]) for word2vec
type: integer
default value: '5 '
exclusiveMinimum: false
minimum: 3
|
withIdf
IDF Weighting
|
Weight vector components based on inverse document frequency
type: boolean
default value: 'true '
|