analyzerConfig
Lucene Analyzer Schema
|
LuceneTextAnalyzer schema for tokenization (JSON-encoded)
type: string
default value: '{ "analyzers": [{ "name": "StdTokLowerStop","charFilters": [ { "type": "htmlstrip" } ],"tokenizer": { "type": "standard" },"filters": [{ "type": "lowercase" },{ "type": "KStem" },{ "type": "patternreplace", "pattern": "^[\\d.]+$", "replacement": " ", "replace": "all" },{ "type": "length", "min": "2", "max": "32767" },{ "type": "fusionstop", "ignoreCase": "true", "format": "snowball", "words": "org/apache/lucene/analysis/snowball/english_stop.txt" }] }],"fields": [{ "regex": ".+", "analyzer": "StdTokLowerStop" } ]}'
minLength: 1
|
clusterIdField
Output Field Name for Cluster Id
|
Output field name for unique cluster id.
type: string
default value: 'cluster_id'
|
clusterLabelField
Detected Cluster Keywords Field Name
|
Output field name for top frequent terms that are (mostly) unique for each cluster.
type: string
default value: 'cluster_label'
|
clusteringMethod
Clustering Method (hierarchical or kmeans)
|
Choose between hierarchical vs kmeans clustering.
type: string
default value: 'hierarchical'
|
dataFormat
Data format
|
Spark-compatible format that the training data comes in (such as 'solr', 'hdfs', 'file', 'parquet', etc.)
type: string
default value: 'solr'
enum: {
solr
hdfs
file
parquet
}
|
distToCenterField
Output Field Name for doc distance to its cluster center
|
Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).
type: string
default value: 'dist_to_center'
|
docLenTrim
Find Extreme Length Doc Flag
|
Whether to separate out docs with extreme lengths.
type: boolean
default value: 'true'
|
fieldToVectorize
Field to Vectorize
required
|
Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.
type: string
minLength: 1
|
freqTermField
Top Frequent Terms Field Name
|
Output field name for top frequent terms in each cluster. These may overlap with other clusters.
type: string
default value: 'freq_terms'
|
id
Spark Job ID
required
|
The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.
type: string
maxLength: 63
pattern: [a-zA-Z][_\-a-zA-Z0-9]*[a-zA-Z0-9]?
|
kDiscount
Discount for K when choosing number of clusters
|
Applies a discount to help favor large/small K (number of clusters). A smaller value pushes K to assume a higher value within the [min, max] K range.
type: number
default value: '0.7'
|
kExact
Number of Clusters
|
Exact number of clusters.
type: integer
default value: '0'
|
kMax
Max Possible Number of Clusters
|
Max possible number of clusters.
type: integer
default value: '20'
|
kMin
Min Possible Number of Clusters
|
Min possible number of clusters.
type: integer
default value: '2'
|
longLen
Length Threshold for Long Doc
|
Length threshold to define long document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.
type: number
default value: '0.99'
|
maxDF
Max Doc Support
|
Max number of documents the term can show up in. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.
type: number
default value: '0.5'
|
minDF
Min Doc Support
|
Min number of documents the term has to show up in. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.
type: number
default value: '5.0'
|
minDivisibleSize
Minimum divisible cluster size
|
Clusters must have at least this many documents to be split further. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.
type: number
default value: '0.0'
|
modelId
Model ID
|
Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.
type: string
minLength: 1
|
norm
Vector normalization
|
p-norm to normalize vectors with (choose -1 to turn normalization off)
type: integer
default value: '2'
enum: {
-1
0
1
2
}
|
numKeywordsPerLabel
Number of Keywords for Each Cluster
|
Number of keywords needed for labeling each cluster.
type: integer
default value: '5'
|
outlierK
Number of outlier groups
|
Number of clusters to help find outliers.
type: integer
default value: '10'
|
outlierThreshold
Outlier cutoff
|
Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.
type: number
default value: '0.01'
|
outlierTrim
Find Outliers Flag
|
Whether to perform outlier detection.
type: boolean
default value: 'true'
|
outputCollection
Output Collection
required
|
Solr Collection to store model-labeled data to
type: string
minLength: 1
|
overwriteOutput
Overwrite Output
|
Overwrite output collection
type: boolean
default value: 'true'
|
randomSeed
Random seed
|
For any deterministic pseudorandom number generation
type: integer
default value: '1234'
|
shortLen
Length Threshold for Short Doc
|
Length threshold to define short document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.
type: number
default value: '5.0'
|
sourceFields
Fields to Load
|
Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.
type: string
|
sparkConfig
Spark Settings
|
Spark configuration settings.
type: array of object
object attributes: {
key
(required)
: {
display name: Parameter Name
type: string
}
value
: {
display name: Parameter Value
type: string
}
}
|
stopwordsList
List of stopwords
|
Stopwords defined in Lucene analyzer config
type: array of string
|
trainingCollection
Training Collection
required
|
Solr Collection containing documents to be clustered
type: string
minLength: 1
|
trainingDataFilterQuery
Training data filter query
|
Solr query to use when loading training data
type: string
default value: '*:*'
minLength: 3
|
trainingDataFrameConfigOptions
Dataframe Config Options
|
Additional spark dataframe loading configuration options
type: object
object attributes: {
}
|
trainingDataSamplingFraction
Training data sampling fraction
|
Fraction of the training data to use
type: number
default value: '1.0'
exclusiveMaximum: false
maximum: 1.0
|
type
Spark Job Type
required
|
type: string
default value: 'doc_clustering'
enum: {
doc_clustering
}
|
uidField
ID Field Name
required
|
Field containing the unique ID for each document.
type: string
default value: 'id'
minLength: 1
|
w2vDimension
Word2Vec Dimension
|
Word-vector dimensionality to represent text (choose > 0 to use, suggested dimension ranges: 100~150)
type: integer
default value: '0'
exclusiveMinimum: false
minimum: 0
|
w2vWindowSize
Word2Vec Window Size
|
The window size (context words from [-window, window]) for word2vec
type: integer
default value: '8'
exclusiveMinimum: false
minimum: 3
|
writeOptions
Write Options
|
Options used when writing output to Solr.
type: array of object
object attributes: {
key
(required)
: {
display name: Parameter Name
type: string
}
value
: {
display name: Parameter Value
type: string
}
}
|