import requests

# Replace {FUSION HOST} with your Fusion host name, and <encoded-value>
# with your Base64-encoded username:password credentials.
url = "https://{FUSION HOST}/api/spark/schema"
headers = {"Authorization": "Basic <encoded-value>"}

response = requests.get(url, headers=headers)
print(response.text)
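The response body is the JSON schema describing every Spark job type available through the API. As a minimal sketch (assuming the request above succeeded and `response` is still in scope), the job types can be enumerated from the schema's "oneOf" list:

schema = response.json()
# Each entry under "oneOf" describes one Spark job type; its machine name
# is the default value of the "type" property.
for job in schema.get("oneOf", []):
    print(job["title"], "->", job["properties"]["type"]["default"])

The schema returned looks like the following:

{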
"type": "object",
"properties": {},
"oneOf": [
{
"type": "object",
"title": "Query-to-Query Session Based Similarity",
"description": "Use this job to to batch compute query-query similarities using a co-occurrence based approach",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"docIdField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection containing queries, document id and event counts. Can be either signal aggregation collection or raw signals collection."
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field Name",
"description": "Field containing queries.",
"default": "query_s",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Data filter query",
"description": "Solr query to additionally filter the input collection.",
"default": "*:*",
"hints": [
"dummy"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output collection",
"description": "Collection to store synonym and similar query pairs.",
"hints": [
"dummy"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"dummy",
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"specialCharsFilterString": {
"type": "string",
"title": "Special characters to be filtered out",
"description": "String of special characters to be filtered from queries.",
"default": "~!@#$^%&*\\(\\)_+={}\\[\\]|;:\"'<,>.?`/\\\\-",
"hints": [
"advanced"
]
},
"minQueryLength": {
"type": "integer",
"title": "Minimum query length",
"description": "Queries below this length (in number of characters) will not be considered for generating recommendations.",
"default": 3,
"minimum": 1,
"exclusiveMinimum": false
},
"maxQueryLength": {
"type": "integer",
"title": "Maximum query length",
"description": "Queries above this length will not be considered for generating recommendations.",
"default": 50,
"minimum": 1,
"exclusiveMinimum": false
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Solr field containing number of events (e.g., number of clicks).",
"default": "count_i"
},
"docIdField": {
"type": "string",
"title": "Document id Field Name",
"description": "Solr field containing document id that user clicked.",
"default": "doc_id_s"
},
"overlapThreshold": {
"type": "number",
"title": "Query Similarity Threshold",
"description": "The threshold above which query pairs are consider similar. Decreasing the value can fetch more pairs at the expense of quality.",
"default": 0.3,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"minQueryCount": {
"type": "integer",
"title": "Query Clicks Threshold",
"description": "The minimum number of clicked documents needed for comparing queries.",
"default": 1,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"overlapEnabled": {
"type": "boolean",
"title": "Boost on token overlap",
"description": "Maximize score for query pairs with overlapping tokens by setting score to 1.",
"default": true,
"hints": [
"advanced"
]
},
"tokenOverlapValue": {
"type": "number",
"title": "Minimum match for token overlap",
"description": "Minimum amount of overlap to consider for boosting. To specify overlap in terms of ratio, specify a value in (0, 1). To specify overlap in terms of exact count, specify a value >= 1. If value is 0, boost will be applied if one query is a substring of its pair.Stopwords are ignored while counting overlaps.",
"default": 1,
"hints": [
"advanced"
]
},
"sessionIdField": {
"type": "string",
"title": "Session/User ID field",
"description": "If session id is not available, specify user id field instead. If this field is left blank, session based recommendations will be disabled.",
"default": "session_id_s"
},
"minPairOccCount": {
"type": "integer",
"title": "Minimum query-recommendation pair occurrence count",
"description": "Minimum number of times a query pair must be generated to be considered valid.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsBlobName": {
"type": "string",
"title": "Stopwords Blob Store",
"description": "Name of the stopwords blob resource. This is a .txt file with one stopword per line. By default the file is called stopwords/stopwords_nltk_en.txt however a custom file can also be used. Check documentation for more details on format and uploading to blob store.",
"default": "stopwords/stopwords_en.txt",
"reference": "blob",
"blobType": "file:spark"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"similar_queries"
],
"default": "similar_queries",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"countField",
"docIdField",
"sessionIdField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minQueryLength",
"maxQueryLength",
"specialCharsFilterString",
"stopwordsBlobName",
"overlapThreshold",
"overlapEnabled",
"tokenOverlapValue",
"minQueryCount",
"minPairOccCount"
]
}
]
},
{
"type": "object",
"title": "Smart Answers Coldstart Training (deprecated)",
"description": "Trains Smart Answers model on a cold start (unsupervised) basis with with pre-trained or trained embeddings and deploys the trained model to the ML Model Service",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"textColName",
"deployModelName",
"modelBase",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"textColName": {
"type": "string",
"title": "Field which contains the content documents",
"description": "Field which contains the documents that will be used to learn about the vocabulary. If multiple fields, please separate them by comma, e.g. question,answer."
},
"deployModelName": {
"type": "string",
"title": "Model Deployment Name",
"description": "Name of the model to be used for deployment (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"modelBase": {
"type": "string",
"title": "Model base",
"description": "Specify one of these custom embeddings: ['word_custom', 'bpe_custom'] or choose one of the included pre-trained embeddings / models.",
"enum": [
"word_custom",
"bpe_custom",
"word_en_300d_2M",
"bpe_en_300d_10K",
"bpe_en_300d_200K",
"bpe_ja_300d_100K",
"bpe_ko_300d_100K",
"bpe_zh_300d_50K",
"bpe_multi_300d_320K",
"distilbert_en",
"distilbert_multi",
"biobert_v1.1"
],
"default": "word_en_300d_2M"
},
"testMode": {
"type": "boolean",
"title": "Test Mode",
"description": "If set to true, then the training will exit after the first iteration. Useful for ensuring that the end-to-end pipeline is working",
"default": false,
"hints": [
"hidden"
]
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1
},
"w2vEpochs": {
"type": "integer",
"title": "Word2Vec training epochs",
"description": "Number of epochs to train custom Word2Vec embeddings",
"default": 15,
"hints": [
"advanced"
]
},
"w2vVectorSize": {
"type": "integer",
"title": "Size of word vectors",
"description": "Word-vector dimensionality to represent text (suggested dimension ranges: 100~300)",
"default": 150,
"hints": [
"advanced"
]
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec window size",
"description": "The window size (context words from [-window, window]) for Word2Vec",
"default": 8,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "The proportion of data to be sampled from the full dataset. Use a value between 0 and 1 for a proportion (e.g. 0.5 for 50%), or for a specific number of examples, use an integer larger than 1. Leave blank for no sampling",
"hints": [
"advanced"
]
},
"seed": {
"type": "integer",
"title": "Seed",
"description": "Random seed for sampling",
"default": 12345,
"hints": [
"hidden"
]
},
"minTokensNum": {
"type": "integer",
"title": "Minimum number of words in doc",
"description": "Drop document if the total words is lower than this value",
"default": 1,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"maxTokensNum": {
"type": "integer",
"title": "Maximum number of words in doc",
"description": "Drop document if the total words is greater than this value",
"default": 5000,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"lowerCases": {
"type": "boolean",
"title": "Lower case all words",
"description": "Whether to lower case all words in training, i.e. whether to treat upper case and lower case words equally. Only utilized for custom embeddings or for the default model base: word_en_300d_2M.",
"default": true
},
"maxVocabSize": {
"type": "integer",
"title": "Maximum vocabulary size",
"description": "Maximum number of words in vocabulary, words will be trimmed if frequency is too low",
"default": 100000,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"extraTrainingArgs": {
"type": "string",
"title": "Extra training args for Python scripts",
"description": "Add any additional arguments for the Python training scripts in this field",
"hints": [
"hidden"
]
},
"maxLen": {
"type": "integer",
"title": "Max Length",
"description": "Max length of question/answer by number of tokens"
},
"infBatch": {
"type": "integer",
"title": "Inference batch size",
"description": "The batch size used for encoding during the training",
"hints": [
"advanced"
]
},
"numClusters": {
"type": "integer",
"title": "Number of clusters",
"description": "DEPRECATED: please, consider using Milvus for fast dense vector similarity search. Number of clusters to be used for fast dense vector retrieval. Note no clustering will be applied if this is set to 0. If left blank, cluster count will be inferred by the job depending on the data",
"default": 0,
"hints": [
"advanced"
]
},
"topKClusters": {
"type": "integer",
"title": "Top k of clusters to return",
"description": "How many closest clusters the model can find for each query. At retrieval time, all answers in top k nearest clusters will be returned and reranked",
"default": 10,
"hints": [
"advanced"
]
},
"unidecode": {
"type": "boolean",
"title": "Apply unicode decoding",
"description": "Use Unidecode library to transform Unicode input into ASCII transliterations. Only utilized for custom embeddings or for the default model base: word_en_300d_2M",
"default": true
},
"globalPoolType": {
"type": "string",
"title": "Global Pool Type",
"description": "Determines how token vectors should be aggregated to obtain final content vector. Must be one of: [avg, max].",
"enum": [
"avg",
"max"
],
"default": "avg",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-qna-coldstart"
],
"default": "argo-qna-coldstart",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"textColName",
"deployModelName",
"modelReplicas",
"secretName",
"testMode"
]
},
{
"label": "Data Preprocessing",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"seed",
"minTokensNum",
"maxTokensNum",
"lowerCases",
"unidecode",
"maxVocabSize"
]
},
{
"label": "Custom Embeddings Initialization",
"properties": [
"w2vEpochs",
"w2vVectorSize",
"w2vWindowSize"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"maxLen",
"infBatch",
"numClusters",
"topKClusters",
"globalPoolType"
]
}
]
},
{
"type": "object",
"title": "Ranking Metrics",
"description": "use this job to calculate relevance metrics (nDCG etc..) by replaying ground truth queries (see ground truth job) against catalog data using variants from an experiment.",
"required": [
"id",
"groundTruthConfig",
"rankingExperimentConfig",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"groundTruthConfig": {
"type": "object",
"title": "Configure ground truth dataset",
"description": "Configure properties for Ground truth dataset",
"required": [
"inputCollection"
],
"properties": {
"inputCollection": {
"type": "string",
"title": "Ground Truth Input Collection",
"description": "Input collection representing ground truth dataset",
"minLength": 1
},
"filterQueries": {
"type": "array",
"title": "Filter queries",
"description": "Solr filter queries to apply against Ground truth collection",
"default": [
"type:ground_truth"
],
"hints": [
"advanced"
],
"items": {
"type": "string",
"default": "[\"type:ground_truth\"]"
}
},
"queryField": {
"type": "string",
"title": "Query field",
"description": "Query field in the collection",
"default": "query",
"hints": [
"advanced"
]
},
"docIdField": {
"type": "string",
"title": "Doc ID field",
"description": "Field containing ranked doc id's",
"default": "docId",
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Weight Field",
"description": "Field representing the weight of document to the query",
"default": "weight",
"hints": [
"advanced"
]
}
}
},
"rankingExperimentConfig": {
"type": "object",
"title": "Configure experiment",
"description": "Configure properties for the experiment",
"properties": {
"inputCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection to run the experiment on",
"hints": [
"advanced"
],
"minLength": 1
},
"queryPipelines": {
"type": "array",
"title": "Query pipelines",
"description": "Pipeline variants for experiment",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"docIdField": {
"type": "string",
"title": "Doc Id Field",
"description": "Doc id field to retrieve values (Must return values that match the ground truth data)",
"default": "id",
"hints": [
"advanced"
]
},
"experimentId": {
"type": "string",
"title": "Experiment ID",
"description": "Calculate ranking metrics using variants from experiment",
"minLength": 1
},
"experimentObjectiveName": {
"type": "string",
"title": "Experiment metric name",
"description": "Experiment objective name",
"minLength": 1
},
"defaultProfile": {
"type": "string",
"title": "Default Query Profile",
"description": "Default query profile to use if not specified in experiment variants"
}
}
},
"outputCollection": {
"type": "string",
"title": "Output collection",
"description": "Output collection to save the ranking metrics to",
"minLength": 1
},
"rankingPositionK": {
"type": "integer",
"title": "Ranking Position @K",
"description": "Ranking position at K for metrics calculation",
"default": 10,
"hints": [
"advanced"
]
},
"metricsPerQuery": {
"type": "boolean",
"title": "Calculate metrics per query",
"description": "Calculate ranking metrics per each query in ground truth set and save them to Solr collection",
"default": true,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"ranking_metrics"
],
"default": "ranking_metrics",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Ground Truth Parameters",
"properties": [
"groundTruthConfig"
]
},
{
"label": "Ranking Experiment Parameters",
"properties": [
"rankingExperimentConfig"
]
}
]
},
{
"type": "object",
"title": "Data Augmentation (deprecated)",
"description": "Use this job to perform Text Augmentation",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"outputCollection",
"outputFormat",
"includeOriginalData",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Input format",
"description": "The format of the training data - solr, parquet etc.",
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"batchSize": {
"type": "string",
"title": "Batch Size",
"description": "If writing to solr, this field defines the batch size for documents to be pushed to solr.",
"default": "15000",
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output path",
"description": "Output collection to store generated augmented data.",
"minLength": 1
},
"outputFormat": {
"type": "string",
"title": "Output Format",
"description": "The format of the output data - solr, parquet etc.",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"backTranslations": {
"type": "array",
"title": "Back Translation",
"description": "Augment data via translation to a different language and then translating back to original language. Chain of languages can be used for translation. Works at sentence level for medium-long length text. GPU recommended and will be used when available.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish",
"Hebrew",
"Ukrainian",
"Chinese",
"Japanese",
"Korean"
],
"minLength": 1
},
"intermediateLanguage": {
"type": "string",
"title": "Intermediate Language",
"description": "Specify languages in order to be used in back translation separated by comma. Only use languages present in input data language dropdown. Bigger chains will take more time to augment. ",
"default": "German",
"pattern": "((?:English|German|French|Italian|Spanish|Dutch|Polish|Ukrainian|Hebrew|Chinese|Japanese|Korean)*(\\s)*(,)*(\\s)*){0,12}"
},
"batchSize": {
"type": "integer",
"title": "Batch Size",
"description": "Number of input data samples to back-translate at a time. Important if Use GPU is checked to avoid memory overflow.",
"default": 256,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"beamSize": {
"type": "integer",
"title": "Beam Size",
"description": "Number of beams to evaluate during translation. Use higher number if translation is poor. Higher number will increase execution time and memory use.",
"default": 1,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"minSentenceLength": {
"type": "integer",
"title": "Min translation length (tokens)",
"description": "Do not back translate sentences shorter than specified length in tokens. If the value is more than max translation length, then max translation length will be used.",
"default": 40,
"hints": [
"advanced"
],
"maximum": 510,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxSentenceLength": {
"type": "integer",
"title": "Max translation length (tokens)",
"description": "Do not back translate sentences longer than specified length in tokens. If the value is less than min translation length, hen min translation length will be used.",
"default": 240,
"hints": [
"advanced"
],
"maximum": 510,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
}
}
}
},
"keyStrokeMisspellings": {
"type": "array",
"title": "Keystroke Misspellings",
"description": "Augment data via insertion, substitution, swapping and deletion of characters based on keyboard layout. Useful for short text.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish",
"Hebrew",
"Ukrainian"
],
"minLength": 1
},
"minCharAugment": {
"type": "integer",
"title": "Minimum Chars to Augment",
"description": "Minimum number of characters to augment in each word. If the value is more than Maximum Chars to Augment, then Maximum Chars to Augment will be used.",
"default": 1,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxCharAugment": {
"type": "integer",
"title": "Maximum Chars to Augment",
"description": "Maximum number of characters to augment in each word. If the value is more than Minimum Chars to Augment, then Minimum Chars to Augment will be used.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"minWordsToAugment": {
"type": "integer",
"title": "Min words to Augment",
"description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxWordsToAugment": {
"type": "integer",
"title": "Max words to Augment",
"description": "Maximum number of words to be augmented in input text.It should be less than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
"default": 10,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"wordPercentageToAugment": {
"type": "number",
"title": "Percentage words to Augment",
"description": "Percentage of words in input text to augment. If specified this will be used instead if minimum/maximum number of words to augment value.",
"default": 0.2,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"keywordsBlobName": {
"type": "string",
"title": "Keystroke Mapping",
"description": "Keystroke Mapping for required language in JSON format from blob store.",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "file:spark"
}
}
}
},
"synonymSubstitutions": {
"type": "array",
"title": "Synonym Substitution",
"description": "Augment data via substituting words using synonyms from wordnet or user supplied dictionary. Useful for short, medium and long text. Faster and less resource intensive than back translation.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish",
"Hebrew",
"Chinese",
"Japanese"
],
"minLength": 1
},
"minWordsToAugment": {
"type": "integer",
"title": "Min words to Augment",
"description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxWordsToAugment": {
"type": "integer",
"title": "Max words to Augment",
"description": "Maximum number of words to be augmented in input text.It should be less than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
"default": 10,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"wordPercentageToAugment": {
"type": "number",
"title": "Percentage of words to Augment",
"description": "Percentage of words in input text to augment. If specified this will be used instead if minimum/maximum number of words to augment value.",
"default": 0.2,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"stopwordsBlobName": {
"type": "string",
"title": "Synonym Dictionary Name",
"description": "Wordnet format dictionary to use from blob store",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "file:spark"
}
}
}
},
"splitWords": {
"type": "array",
"title": "Split Words",
"description": "Augment data via splitting some words. Useful for short, medium and long text.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish"
],
"minLength": 1
},
"minWordLength": {
"type": "integer",
"title": "Minimum Word Length",
"description": "Do not augment words less than this length (in characters). If the value is more than maximum word length, then maximum word length will be used.",
"default": 4,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"minWordsToAugment": {
"type": "integer",
"title": "Min words to Augment",
"description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxWordsToAugment": {
"type": "integer",
"title": "Max words to Augment",
"description": "Maximum number of words to be augmented in input text.It should be less than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
"default": 10,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"wordPercentageToAugment": {
"type": "number",
"title": "Percentage of words to Augment",
"description": "Percentage of words in input text to augment. If specified this will be used instead if minimum/maximum number of words to augment value.",
"default": 0.2,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
}
}
}
},
"includeOriginalData": {
"type": "boolean",
"title": "Include original data",
"description": "When checked original data will be included in the augmented dataset",
"default": true
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-data-augmentation"
],
"default": "argo-data-augmentation",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"trainingDataFilterQuery",
"trainingSampleFraction",
"randomSeed",
"batchSize",
"outputCollection",
"outputFormat",
"partitionFields",
"secretName",
"includeOriginalData"
]
},
{
"label": "Augmentation Parameters",
"properties": [
"backTranslations",
"keyStrokeMisspellings",
"synonymSubstitutions",
"splitWords"
]
}
]
},
{
"type": "object",
"title": "Create Indexes in Milvus (deprecated)",
"description": "Creates indexes for specified collections in Milvus",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"indexes-list": {
"type": "array",
"title": "Indexes",
"description": "List of the indexes that should be created with corresponding params.",
"items": {
"type": "object",
"required": [
"milvusCollectionName",
"indexType"
],
"properties": {
"milvusCollectionName": {
"type": "string",
"title": "Collection Name",
"description": "Name of the collection in Milvus in which index should be created"
},
"indexType": {
"type": "string",
"title": "Index Type",
"description": "Index type which should be create for specified collection",
"enum": [
"FLAT",
"IVFLAT",
"IVF_SQ8",
"RNSG",
"IVF_SQ8H",
"IVF_PQ",
"HNSW",
"ANNOY"
],
"default": "HNSW"
},
"indexParams": {
"type": "array",
"title": "Index Parameters",
"description": "Parameters to be used to create index in Milvus. Specific to the chosen IndexType. For example, good starting values might be [M=36, efConstruction=500] for HNSW index and [nlist=4×sqrt(number of vectors)] for IVF indexes.",
"items": {
"type": "object",
"properties": {
"key": {
"type": "string",
"title": "Milvus Index Param",
"description": "The name of the Milvus index params like M / efConstruction for HNSW or nlist for IVF indexes."
},
"value": {
"type": "integer",
"title": "Value",
"description": "Index param value. For example, good starting values might be [M=36, efConstruction=500] for HNSW index and [nlist=4×sqrt(number of vectors)] for IVF indexes."
}
}
}
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-create-indexes"
],
"default": "argo-milvus-create-indexes",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
{
"type": "object",
"title": "Custom Python Job",
"description": "Use this job when you want to run a python/pyspark job",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"script": {
"type": "string",
"title": "Python Script",
"description": "Custom python/pyspark script to be submitted as a Fusion job",
"hints": [
"code/python",
"lengthy"
],
"minLength": 1
},
"resourceName": {
"type": "string",
"title": "Blob Resource (python file)",
"description": "Name of the resource uploaded to Blob store. This should match with the Blob name",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
},
"pythonFiles": {
"type": "array",
"title": "Python Files",
"description": "Blob resource (.zip, .egg, .py files) to place on the PYTHONPATH for Python apps",
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"submitArgs": {
"type": "array",
"title": "Spark args",
"description": "Additional options to pass to the Spark Submit when running this job.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"javaOptions": {
"type": "array",
"title": "Java options",
"description": "Java options to pass to Spark driver/executor",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"verboseReporting": {
"type": "boolean",
"title": "Verbose reporting",
"description": "Enables verbose reporting for SparkSubmit",
"default": true,
"hints": [
"advanced"
]
},
"envOptions": {
"type": "array",
"title": "ENV properties",
"description": "Set environment variables for driver",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"custom_python_job"
],
"default": "custom_python_job",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
{
"type": "object",
"title": "Head/Tail Analysis (Deprecated)",
"description": "Use this job when you want to compare the head and tail of your queries to find common misspellings and rewritings. See the insights analytics pane for a review of the results of the job. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"countField",
"mainType",
"signalTypeField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Signals collection containing queries and event counts. Raw signals or aggregation collection can be used. If aggregation collection is being used, update the filter query in advanced options",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field Name",
"description": "Field containing the queries",
"default": "query",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Signals data filter query",
"description": "Solr query to use when loading training data if using Solr (e.g. type:click OR type:response), Spark SQL expression for all other data sources",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr collection to store head tail analytics results. Defaults to job reports collection"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"tailRewriteCollection": {
"type": "string",
"title": "Tail Rewrite Collection",
"description": "Collection where tail rewrites are stored.",
"minLength": 1
},
"analyzerConfigQuery": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"StdTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"englishminimalstem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStem\" } ]}",
"hints": [
"lengthy",
"advanced",
"code/json"
],
"minLength": 1
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Field containing the number of times an event (like a click) occurs for a particular query; count_i in the raw signal collection or aggr_count_i in the aggregated signal collection.",
"default": "count_i",
"minLength": 1
},
"mainType": {
"type": "string",
"title": "Main Event Type",
"description": "The main signal event type (e.g. click) that head tail analysis is based on. E.g., if main type is click, then head and tail queries are defined by the number of clicks.",
"default": "click",
"minLength": 1
},
"filterType": {
"type": "string",
"title": "Filtering Event Type",
"description": "The secondary event type (e.g. response) that can be used for filtering out rare searches. Note: In order to use the `response` default value, please make sure you have type:response in the input collection. If there is no need to filter on number of searches, please leave this parameter blank.",
"default": "response"
},
"signalTypeField": {
"type": "string",
"title": "Field Name of Signal Type",
"description": "The field name of signal type in the input collection.",
"default": "type"
},
"minCountMain": {
"type": "integer",
"title": "Minimum Main Event Count",
"description": "Minimum number of main events (e.g. clicks after aggregation) necessary for the query to be considered. The job will only analyze queries with clicks greater or equal to this number.",
"default": 1
},
"minCountFilter": {
"type": "integer",
"title": "Minimum Filtering Event Count",
"description": "Minimum number of filtering events (e.g. searches after aggregation) necessary for the query to be considered. The job will only analyze queries that were issued greater or equal to this number of times.",
"default": 20
},
"queryLenThreshold": {
"type": "integer",
"title": "Minimum Query Length ",
"description": "Minimum length of a query to be included for analysis. The job will only analyze queries with length greater than or equal to this value.",
"default": 2
},
"userHead": {
"type": "number",
"title": "Head Count Threshold",
"description": "User defined threshold for head definition. value=-1.0 will allow the program to pick the number automatically. value<1.0 denotes a percentage (e.g 0.1 means put the top 10% of queries into the head), value=1.0 denotes 100% (e.g 1 means put all queries into the head), value>1.0 denotes the exact number of queries to put in the head (e.g 100 means the top 100 queries constitute the head)",
"default": -1,
"hints": [
"advanced"
]
},
"userTail": {
"type": "number",
"title": "Tail Count Threshold",
"description": "User defined threshold for tail definition. value=-1.0 will allow the program to pick the number automatically. value<1.0 denotes a percentage, (e.g 0.1 means put the bottom 10% of queries into the tail) value=1.0 denotes 100% (e.g 1 means put all queries into the tail), value>1.0 denotes the exact number of queries to put into the tail (e.g 100 means the bottom 100 queries constitute the tail).",
"default": -1,
"hints": [
"advanced"
]
},
"topQ": {
"type": "array",
"title": "Top X% Head Query Event Count",
"description": "Compute how many total events come from the top X head queries (Either a number greater than or equal to 1.0 or a percentage of the total number of unique queries)",
"default": [
100,
0.01
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"trafficPerc": {
"type": "array",
"title": "Number of Queries that Constitute X% of Total Events",
"description": "Compute how many queries constitute each of the specified event portions(E.g., 0.25, 0.50)",
"default": [
0.25,
0.5,
0.75
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"lastTraffic": {
"type": "array",
"title": "Bottom X% Tail Query Event Count",
"description": "Compute the total number of queries that are spread over each of the specified tail event portions (E.g., 0.01)",
"default": [
0.01
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"trafficCount": {
"type": "array",
"title": "Event Count Computation Threshold",
"description": "Compute how many queries have events less than each value specified (E.g., a value of 5.0 would return the number of queries that have less than 5 associated events)",
"default": [
5
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"keywordsBlobName": {
"type": "string",
"title": "Keywords blob name",
"description": "Name of the keywords blob resource. Typically, this should be a csv file uploaded to blob store in a specific format. Check documentation for more details on format and uploading to blob store ",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
},
"lenScale": {
"type": "integer",
"title": "Edit Distance vs String Length Scale",
"description": "A scaling factor used to normalize the length of the query string. This filters head and tail string match based on if edit_dist <= string_length/length_scale. A large value for this factor leads to a shorter spelling list. A smaller value leads to a longer spelling list but may add lower quality corrections.",
"default": 6,
"hints": [
"advanced"
]
},
"overlapThreshold": {
"type": "integer",
"title": "Head and tail Overlap threshold",
"description": "The threshold for the number of overlapping tokens between the head and tail. When a head string and tail string share more tokens than this threshold, they are considered a good match.",
"default": 4,
"hints": [
"advanced"
]
},
"overlapNumBoost": {
"type": "number",
"title": "Token Overlap Number Boost",
"description": "When there are multiple possible head matches for a tail, we rank heads based on: overlapNumBoost * overlapNum + headQueryCountBoost * log(headQueryCount). A big number puts more weight on how many tokens match between the head and tail query strings instead of the number of times a head query appears.",
"default": 10,
"hints": [
"hidden",
"advanced"
]
},
"headQueryCntBoost": {
"type": "number",
"title": "Head query count boost",
"description": "When there are multiple possible head matches for tail, we rank heads based on: overlapNumBoost * overlapNum + headQueryCountBoost * log(headQueryCount). A big number puts more weight on the count head query instead of the number of tokens shared between the head and tail query strings",
"default": 1,
"hints": [
"hidden",
"advanced"
]
},
"tailRewrite": {
"type": "boolean",
"title": "Generate tail rewrite table",
"description": "If true, also generate tail rewrite table, o.w., only get distributions. May need to set it to false in the very first run to help customize head and tail positions.",
"default": true,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"headTailAnalysis"
],
"default": "headTailAnalysis",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"signalTypeField",
"mainType",
"filterType",
"countField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minCountMain",
"minCountFilter",
"tailRewrite",
"userHead",
"userTail",
"lenScale",
"overlapThreshold",
"topQ",
"trafficCount",
"trafficPerc",
"lastTraffic"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfigQuery",
"queryLenThreshold"
]
},
{
"label": "Misc. Parameters",
"properties": [
"keywordsBlobName"
]
}
]
},
{
"type": "object",
"title": "SQL Aggregation",
"description": "Use this job when you want to aggregate your data in some way.",
"required": [
"id",
"inputCollection",
"sql",
"dataFormat",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputCollection": {
"type": "string",
"title": "Source Collection",
"description": "Collection containing signals to be aggregated."
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "The collection to write the aggregates to on output. This property is required if the selected output / rollup pipeline requires it (the default pipeline does). A special value of '-' disables the output.",
"hints": [
"advanced"
],
"minLength": 1
},
"rows": {
"type": "integer",
"title": "Batch Size",
"description": "Number of rows to read from the source collection per request.",
"default": 10000,
"hints": [
"advanced"
]
},
"sql": {
"type": "string",
"title": "SQL",
"description": "Use SQL to perform the aggregation. You do not need to include a time range filter in the WHERE clause as it gets applied automatically before executing the SQL statement.",
"hints": [
"lengthy",
"code/sql"
],
"minLength": 1
},
"rollupSql": {
"type": "string",
"title": "Rollup SQL",
"description": "Use SQL to perform a rollup of previously aggregated docs. If left blank, the aggregation framework will supply a default SQL query to rollup aggregated metrics.",
"hints": [
"lengthy",
"code/sql",
"advanced"
],
"minLength": 1
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Additional configuration settings to fine-tune how input records are read for this aggregation.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"sourceCatchup": {
"type": "boolean",
"title": "Aggregate New and Merge with Existing",
"description": "If checked, only aggregate new signals created since the last time the job was successfully run. If there is a record of such previous run then this overrides the starting time of time range set in 'timeRange' property. If unchecked, then all matching signals are aggregated and any previously aggregated docs are deleted to avoid double counting.",
"default": true,
"hints": [
"advanced"
]
},
"sourceRemove": {
"type": "boolean",
"title": "Remove signals from source",
"description": "If checked, remove signals from source collection once aggregation job has finished running.",
"default": false,
"hints": [
"advanced"
]
},
"aggregationTime": {
"type": "string",
"title": "Aggregation Time",
"description": "Timestamp to use for the aggregation results. Defaults to NOW.",
"hints": [
"advanced"
],
"format": "date-time"
},
"referenceTime": {
"type": "string",
"title": "Reference Time",
"description": "Timestamp to use for computing decays and to determine the value of NOW.",
"hints": [
"advanced"
],
"format": "date-time"
},
"skipCheckEnabled": {
"type": "boolean",
"title": "Job Skip Check Enabled?",
"description": "If the catch-up flag is enabled and this field is checked, the job framework will execute a fast Solr query to determine if this run can be skipped.",
"default": true,
"hints": [
"advanced"
]
},
"skipJobIfSignalsEmpty": {
"type": "boolean",
"title": "Skip Job run",
"description": "Skip Job run if signals collection is empty",
"hints": [
"advanced"
]
},
"parameters": {
"type": "array",
"title": "Parameters",
"description": "Other aggregation parameters (e.g. timestamp field etc..).",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"signalTypes": {
"type": "array",
"title": "Signal Types",
"description": "The signal types. If not set then any signal type is selected",
"items": {
"type": "string"
}
},
"selectQuery": {
"type": "string",
"title": "Query",
"description": "The query to select the desired input documents.",
"default": "*:*",
"hints": [
"advanced"
],
"minLength": 1
},
"timeRange": {
"type": "string",
"title": "Time Range",
"description": "The time range to select signals on.",
"hints": [
"advanced"
],
"minLength": 1
},
"useNaturalKey": {
"type": "boolean",
"title": "Use Natural Key?",
"description": "Use a natural key provided in the raw signals data for aggregation, rather than relying on Solr UUIDs. Migrated aggregations jobs from Fusion 4 will need this set to false.",
"default": true,
"hints": [
"advanced"
]
},
"optimizeSegments": {
"type": "integer",
"title": "Optimize Segments",
"description": "If set to a value above 0, the aggregator job will optimize the resulting Solr collection into this many segments",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"aggregation"
],
"default": "aggregation",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
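Similarly, a minimal SQL Aggregation ("aggregation") configuration might look like the sketch below. The endpoint, collection names, and the SQL itself are illustrative assumptions; only the required keys (id, inputCollection, sql, dataFormat, type) come from the schema above, and the view name used in the SQL should be checked for your Fusion version.

import requests

url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>", "Content-Type": "application/json"}

job = {
    "id": "daily-signal-aggregation",
    "type": "aggregation",             # fixed enum value
    "inputCollection": "my_signals",   # hypothetical source collection
    "dataFormat": "solr",              # schema default
    # Hypothetical SQL; the view name 'signals' is an assumption, and the
    # time-range filter is applied automatically per the description above.
    "sql": "SELECT query AS query_s, COUNT(*) AS aggr_count_i FROM signals GROUP BY query"
}

print(requests.post(url, headers=headers, json=job).status_code)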
{
"type": "object",
"title": "Random Forest Classifier Training (deprecated)",
"description": "Use this job when you have training data and you want to train a random forest model to classify text into groups. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the Classification job instead.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"trainingLabelField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"advanced",
"code/json",
"lengthy"
]
},
"withIdf": {
"type": "boolean",
"title": "IDF Weighting",
"description": "Weight vector components based on inverse document frequency",
"default": true,
"hints": [
"advanced"
]
},
"w2vDimension": {
"type": "integer",
"title": "Word2Vec Dimension",
"description": "Word-vector dimensionality to represent text (choose > 0 to use)",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 5,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxSentenceLength": {
"type": "integer",
"title": "Max Word2Vec Sentence Length",
"description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
"default": 1000,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxIter": {
"type": "integer",
"title": "Max Word2Vec Iterations",
"description": "Maximum number of iterations of the word2vec training",
"default": 1,
"hints": [
"advanced"
]
},
"w2vStepSize": {
"type": "number",
"title": "Word2Vec Step Size",
"description": "Training parameter for word2vec convergence (change at your own peril)",
"default": 0.025,
"hints": [
"advanced"
],
"minimum": 0.005,
"exclusiveMinimum": false
},
"minDF": {
"type": "number",
"title": "Minimum Term Document Frequency",
"description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
"default": 0,
"hints": [
"advanced"
]
},
"maxDF": {
"type": "number",
"title": "Max Term Document Frequency",
"description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
"default": 1,
"hints": [
"advanced"
]
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"predictedLabelField": {
"type": "string",
"title": "Predicted Label Field",
"description": "Solr field which will contain labels when classifier is applied to documents",
"default": "labelPredictedByFusionModel",
"hints": [
"advanced"
]
},
"serializeAsMleap": {
"type": "boolean",
"title": "Serialize as Mleap Bundle",
"description": "Serialize the output model as Mleap Bundle",
"default": true,
"hints": [
"hidden"
]
},
"minSparkPartitions": {
"type": "integer",
"title": "Minimum Number of Spark Partitions",
"description": "Minimum number of Spark partitions for training job.",
"default": 200,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"overwriteExistingModel": {
"type": "boolean",
"title": "Overwrite existing model",
"description": "If a model exists in the model store, overwrite when this job runs",
"default": true,
"hints": [
"advanced"
]
},
"trainingLabelField": {
"type": "string",
"title": "Label Field",
"description": "Solr field containing labels for training instances (should be single-valued strings)"
},
"gridSearch": {
"type": "boolean",
"title": "Grid Search with Cross Validation",
"description": "Perform grid search to optimize hyperparameters",
"default": false
},
"evaluationMetricType": {
"type": "string",
"title": "Evaluation Metric Type",
"description": "Optimize hyperparameter search over one of [binary, multiclass, regression] metrics, or 'none'",
"enum": [
"binary",
"multiclass",
"regression",
"none"
],
"default": "none",
"hints": [
"advanced"
]
},
"autoBalanceClasses": {
"type": "boolean",
"title": "Auto-balance training classes",
"description": "Ensure that all classes of training data have the same size",
"default": true,
"hints": [
"advanced"
]
},
"minTrainingSamplesPerClass": {
"type": "integer",
"title": "Minimum Labeled Class Size",
"description": "Ensure that all classes of training data have at least this many examples",
"default": 100,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"makeOtherClass": {
"type": "boolean",
"title": "Make 'Other' Class",
"description": "Create a label class 'Other' which contains all examples not in a class large enough to train on",
"default": true,
"hints": [
"advanced"
]
},
"otherClassName": {
"type": "string",
"title": "'Other' class name",
"description": "Label class name for the catch-all 'Other' class",
"default": "Other",
"hints": [
"advanced"
],
"minLength": 1
},
"maxDepth": {
"type": "integer",
"title": "Maximum tree depth",
"description": "Maximum depth of the tree (>= 0). E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.",
"default": 5,
"maximum": 20,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false
},
"maxBins": {
"type": "integer",
"title": "Maximum number of discretizing bins",
"description": "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.",
"default": 32,
"maximum": 128,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"numTrees": {
"type": "integer",
"title": "Number of trees",
"description": "Number of trees to train (>= 1)",
"default": 20,
"maximum": 1000,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"random_forests_classifier"
],
"default": "random_forests_classifier",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"predictedLabelField",
"trainingLabelField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"w2vDimension",
"w2vWindowSize",
"w2vMaxIter",
"w2vMaxSentenceLength",
"w2vStepSize",
"withIdf",
"maxDF",
"minDF",
"norm",
"autoBalanceClasses",
"evaluationMetricType",
"minTrainingSamplesPerClass",
"otherClassName",
"makeOtherClass",
"gridSearch",
"maxBins",
"numTrees",
"maxDepth"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
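Although this trainer is deprecated in favor of the Classification job, a minimal configuration for it follows the same pattern. Everything below except the property names and the fixed type value is a hypothetical example.

import requests

url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>", "Content-Type": "application/json"}

job = {
    "id": "rf-classifier-train",
    "type": "random_forests_classifier",   # fixed enum value
    "trainingCollection": "labeled_docs",  # hypothetical collection of labeled docs
    "fieldToVectorize": "body_t",          # hypothetical text field
    "trainingLabelField": "label_s",       # hypothetical single-valued label field
    "dataFormat": "solr",
    "numTrees": 20,                        # schema defaults, shown explicitly
    "maxDepth": 5
}

print(requests.post(url, headers=headers, json=job).status_code)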
{
"type": "object",
"title": "Create Collections in Milvus (deprecated)",
"description": "Creates collections with specified parameters in Milvus",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"collections-list": {
"type": "array",
"title": "Collections",
"description": "List of the collections that should be created with corresponding params.",
"items": {
"type": "object",
"required": [
"milvusCollectionName",
"dimension",
"indexFileSize",
"metric"
],
"properties": {
"milvusCollectionName": {
"type": "string",
"title": "Collection Name",
"description": "Name of the collection to create in Milvus",
"pattern": "^[a-zA-Z0-9_]+$"
},
"dimension": {
"type": "integer",
"title": "Dimension",
"description": "Dimension size of vectors to be stored in the collection"
},
"indexFileSize": {
"type": "integer",
"title": "Index File Size",
"description": "Files larger than this will trigger index building for raw data files",
"default": 1024,
"minimum": 1,
"exclusiveMinimum": false
},
"metric": {
"type": "string",
"title": "Metric",
"description": "Metric which should be used for vectors similarity",
"enum": [
"Euclidean",
"Inner Product",
"Hamming",
"Jaccard",
"Tanimoto",
"Substructure",
"Superstructure"
],
"default": "Inner Product"
}
}
}
},
"allow-recreate": {
"type": "boolean",
"title": "Override collections",
"description": "If checked and there are existing collections with the same names, they will be dropped and recreated. If unchecked, the exception is thrown in such situation."
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-create-collections"
],
"default": "argo-milvus-create-collections",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
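A sketch for the (deprecated) Milvus collection-creation job: only id and type are required at the top level, and each entry of collections-list must supply milvusCollectionName, dimension, indexFileSize, and metric. The names and sizes below are assumptions.

import requests

url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>", "Content-Type": "application/json"}

job = {
    "id": "create-milvus-collections",
    "type": "argo-milvus-create-collections",   # fixed enum value
    "collections-list": [
        {
            "milvusCollectionName": "doc_vectors",  # hypothetical; [a-zA-Z0-9_]+ only
            "dimension": 256,                       # must match your embedding size
            "indexFileSize": 1024,                  # schema default
            "metric": "Inner Product"               # schema default
        }
    ],
    "allow-recreate": False  # an exception is thrown if a collection already exists
}

print(requests.post(url, headers=headers, json=job).status_code)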
{
"type": "object",
"title": "Word2Vec Model Training (deprecated)",
"description": "Trains a shallow neural model, and projects each document onto this vector embedding space. Deprecated as of Fusion 5.2.0 and will be removed in a future release.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"advanced",
"code/json",
"lengthy"
]
},
"withIdf": {
"type": "boolean",
"title": "IDF Weighting",
"description": "Weight vector components based on inverse document frequency",
"default": true,
"hints": [
"advanced"
]
},
"w2vDimension": {
"type": "integer",
"title": "Embedding Dimension",
"description": "Word-vector dimensionality to represent text",
"default": 50,
"hints": [
"dummy"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 5,
"hints": [
"dummy"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxSentenceLength": {
"type": "integer",
"title": "Max Sentence Length",
"description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
"default": 1000,
"hints": [
"dummy"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxIter": {
"type": "integer",
"title": "Max Iterations",
"description": "Maximum number of iterations of the word2vec training",
"default": 1,
"hints": [
"advanced"
]
},
"w2vStepSize": {
"type": "number",
"title": "Step Size",
"description": "Training parameter for word2vec convergence (change at your own peril)",
"default": 0.025,
"hints": [
"advanced"
],
"minimum": 0.005,
"exclusiveMinimum": false
},
"minDF": {
"type": "number",
"title": "Minimum Term Document Frequency",
"description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
"default": 0,
"hints": [
"advanced"
]
},
"maxDF": {
"type": "number",
"title": "Max Term Document Frequency",
"description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
"default": 1,
"hints": [
"advanced"
]
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"predictedLabelField": {
"type": "string",
"title": "Word2Vec Feature Field",
"description": "Solr field which will contain vector features when the word2vec model is applied to documents",
"default": "w2vFeatures",
"hints": [
"hidden"
]
},
"serializeAsMleap": {
"type": "boolean",
"title": "Serialize as Mleap Bundle",
"description": "Serialize the output model as Mleap Bundle",
"default": true,
"hints": [
"hidden"
]
},
"minSparkPartitions": {
"type": "integer",
"title": "Minimum Number of Spark Partitions",
"description": "Minimum number of Spark partitions for training job.",
"default": 200,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"overwriteExistingModel": {
"type": "boolean",
"title": "Overwrite existing model",
"description": "If a model exists in the model store, overwrite when this job runs",
"default": true,
"hints": [
"advanced"
]
},
"outputField": {
"type": "string",
"title": "Output Field",
"description": "Solr field which will contain terms which the word2vec model considers are related to the input",
"default": "related_terms_txt"
},
"uidField": {
"type": "string",
"title": "ID Field Name",
"description": "Field containing the unique ID for each document",
"minLength": 1
},
"numRelatedTerms": {
"type": "integer",
"title": "Number of Related Words",
"description": "For each collection of input words, find this many word2vec-related words",
"default": 10,
"minimum": 1,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"word2vec"
],
"default": "word2vec",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"predictedLabelField",
"uidField",
"outputField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"w2vDimension",
"w2vWindowSize",
"w2vMaxIter",
"w2vMaxSentenceLength",
"w2vStepSize",
"withIdf",
"maxDF",
"minDF",
"norm",
"numRelatedTerms"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
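A sketch for the deprecated Word2Vec trainer; note that outputCollection is required here, unlike in most of the other jobs. All collection and field names below are illustrative assumptions.

import requests

url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>", "Content-Type": "application/json"}

job = {
    "id": "w2v-related-terms",
    "type": "word2vec",                  # fixed enum value
    "trainingCollection": "my_docs",     # hypothetical input collection
    "fieldToVectorize": "body_t:1.0",    # field:weight syntax per the description
    "dataFormat": "solr",
    "outputCollection": "my_docs",       # required for this job type
    "outputField": "related_terms_txt",  # schema default
    "numRelatedTerms": 10                # schema default
}

print(requests.post(url, headers=headers, json=job).status_code)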
{
"type": "object",
"title": "Parameterized SQL Aggregation",
"description": "A SQL aggregation job where users provide parameters to be injected into a built-in SQL template at runtime.",
"required": [
"id",
"inputCollection",
"dataFormat",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputCollection": {
"type": "string",
"title": "Source Collection",
"description": "Collection containing documents to be aggregated."
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "The collection to write the aggregates to on output. Defaults to the input collection if not specified."
},
"notes": {
"type": "string",
"title": "Notes",
"description": "A short description about this job.",
"hints": [
"lengthy"
]
},
"parameters": {
"type": "array",
"title": "SQL Parameters",
"description": "Parameters bound on the SQL template at runtime.",
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"selectQuery": {
"type": "string",
"title": "Query",
"description": "The query to select the desired signals. If not set then '*:*' will be used, or equivalent.",
"default": "*:*",
"hints": [
"advanced"
]
},
"timeRange": {
"type": "string",
"title": "Time Range",
"description": "The time range to select signals on.",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceCatchup": {
"type": "boolean",
"title": "Aggregate New and Merge with Existing",
"description": "If checked, only aggregate new signals created since the last time the job was successfully run. If there is a record of such previous run then this overrides the starting time of time range set in 'timeRange' property. If unchecked, then all matching signals are aggregated and any previously aggregated docs are deleted to avoid double counting.",
"default": true,
"hints": [
"advanced"
]
},
"sql": {
"type": "string",
"title": "SQL",
"description": "Use SQL to perform the aggregation. You do not need to include a time range filter in the WHERE clause as it gets applied automatically before executing the SQL statement.",
"hints": [
"advanced",
"code/sql"
],
"minLength": 1
},
"rollupSql": {
"type": "string",
"title": "Rollup SQL",
"description": "Use SQL to perform a rollup of previously aggregated docs. If left blank, the aggregation framework will supply a default SQL query to rollup aggregated metrics.",
"hints": [
"advanced",
"code/sql"
],
"minLength": 1
},
"sourceRemove": {
"type": "boolean",
"title": "Remove Source",
"description": "If true, the processed source signals will be removed after aggregation. Default is false.",
"default": false,
"hints": [
"advanced"
]
},
"referenceTime": {
"type": "string",
"title": "Reference Time",
"description": "Timestamp to use for computing decays and to determine the value of NOW.",
"hints": [
"advanced"
],
"format": "date-time"
},
"hiddenParameters": {
"type": "array",
"title": "Hidden Parameters",
"description": "Additional settings used to tune the underlying aggregation job.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Additional configuration settings to fine-tune how input records are read for this aggregation.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"skipCheckEnabled": {
"type": "boolean",
"title": "Job Skip Check Enabled?",
"description": "If the catch-up flag is enabled and this field is checked, the job framework will execute a fast Solr query to determine if this run can be skipped.",
"default": true,
"hints": [
"advanced"
]
},
"useNaturalKey": {
"type": "boolean",
"title": "Use Natural Key?",
"description": "Use a natural key provided in the raw signals data for aggregation, rather than relying on Solr UUIDs. Migrated aggregations jobs from Fusion 4 will need this set to false.",
"default": true,
"hints": [
"advanced"
]
},
"optimizeSegments": {
"type": "integer",
"title": "Optimize Segments",
"description": "If set to a value above 0, the aggregator job will optimize the resulting Solr collection into this many segments",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"sql_template"
],
"default": "sql_template",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
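For the Parameterized SQL Aggregation job, the SQL itself is a built-in template, and the parameters array supplies the values bound into it at runtime. The parameter key in the sketch below is hypothetical; consult the template's documentation for the names it actually accepts.

import requests

url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>", "Content-Type": "application/json"}

job = {
    "id": "templated-aggregation",
    "type": "sql_template",            # fixed enum value
    "inputCollection": "my_signals",   # hypothetical source collection
    "dataFormat": "solr",
    "parameters": [
        # Hypothetical template parameter; real keys depend on the built-in template.
        {"key": "signalTypes", "value": "click"}
    ]
}

print(requests.post(url, headers=headers, json=job).status_code)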
{
"type": "object",
"title": "Token and Phrase Spell Correction (Deprecated)",
"description": "Use this job to compute token and phrase level spell correction which you can use in your synonym list. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection containing search strings and event counts. Should ideally be the signals collection.If an aggregation collection is being used, update the filter query in the advanced options",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Input Field",
"description": "Field containing search strings.",
"default": "query",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Data filter query",
"description": "Solr query to use when loading training data if using Solr (e.g. type:click OR type:response), Spark SQL expression for all other data sources",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Collection to store misspelling and correction pairs. Defaults to the query_rewrite_staging collection for the application.",
"hints": [
"dummy"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"stopwordsBlobName": {
"type": "string",
"title": "Stopwords blob (Deprecated)",
"description": "Name of stopwords blob resource (.txt or .rtf file uploaded to the blob store). This field is marked for deprecation. Going forward, please specify the stopwords blob name as a luceneSchema property.",
"hints": [
"advanced"
],
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
},
"dictionaryCollection": {
"type": "string",
"title": "Dictionary Collection",
"description": "Solr Collection containing dictionary with correct spellings. E.g., product catalog."
},
"dictionaryField": {
"type": "string",
"title": "Dictionary Field",
"description": "Solr field containing dictionary text. Multiple fields can be specified using the format: field1,field2 etc."
},
"countField": {
"type": "string",
"title": "Count Field",
"description": "Solr field containing query count",
"default": "count_i"
},
"mainType": {
"type": "string",
"title": "Main Event Type",
"description": "The main signal event type (e.g. click) that the job is based on if input is signal data. E.g., if main type is click, then head and tail tokens/phrases are defined by the number of clicks.",
"default": "click"
},
"filterType": {
"type": "string",
"title": "Filtering Event Type",
"description": "The secondary event type (e.g. response) that can be used for filtering out rare searches.Note: In order to use this `response` default value, please make sure you have type:response in the input collection.If there is no need to filter on number of searches, please leave this parameter blank.",
"default": "response"
},
"signalTypeField": {
"type": "string",
"title": "Field Name of Signal Type",
"description": "The field name of signal type in the input collection.",
"default": "type",
"hints": [
"advanced"
]
},
"minCountMain": {
"type": "integer",
"title": "Minimum Main Event Count",
"description": "Minimum number of main events (e.g. clicks after aggregation) necessary for the query to be considered. The job will only analyze queries with clicks greater or equal to this number.",
"default": 1
},
"minCountFilter": {
"type": "integer",
"title": "Minimum Filtering Event Count",
"description": "Minimum number of filtering events (e.g. searches after aggregation) necessary for the query to be considered. The job will only analyze queries that were issued greater or equal to this number of times.",
"default": 10
},
"dictionaryDataFilterQuery": {
"type": "string",
"title": "Dictionary Data Filter Query",
"description": "Solr query to use when loading dictionary data",
"default": "*:*",
"hints": [
"advanced"
]
},
"minPrefix": {
"type": "integer",
"title": "Minimum Prefix Match",
"description": "The minimum number of matches on starting characters. Note: Setting it to 0 may largely increase running time. ",
"default": 1,
"minimum": 0,
"exclusiveMinimum": false
},
"minMispellingLen": {
"type": "integer",
"title": "Minimum Length of Misspelling",
"description": "The minimum length of misspelling to check. Smaller number may lead to problematic corrections. E.g., It is hard to find the right correction for a two or three character string. ",
"default": 5,
"minimum": 1,
"exclusiveMinimum": false
},
"maxDistance": {
"type": "integer",
"title": "Maximum Edit Distance",
"description": "The maximum edit distance between related token/phrases you are interested in. Large number leads to longer correction list but may add lower quality corrections. ",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"lastCharMatchBoost": {
"type": "number",
"title": "Last Character Match Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on last character match between misspelling and correction strings",
"default": 1,
"hints": [
"advanced"
]
},
"soundMatchBoost": {
"type": "number",
"title": "Sound Match Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on soundex match between misspelling and correction strings",
"default": 3,
"hints": [
"advanced"
]
},
"correctCntBoost": {
"type": "number",
"title": "Correction Count Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on count of correction string occurrences.",
"default": 2,
"hints": [
"advanced"
]
},
"editDistBoost": {
"type": "number",
"title": "Edit Distance Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on shorter edit distance.",
"default": 2,
"hints": [
"advanced"
]
},
"signalDataIndicator": {
"type": "boolean",
"title": "Input is Signal Data",
"description": "The input dataset that the spell checker based on is signal data. If the input data is content document rather than signal, please uncheck.",
"default": true
},
"analyzerConfigQuery": {
"type": "string",
"title": "Lucene Analyzer Schema for Processing Queries",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"analyzerConfigDictionary": {
"type": "string",
"title": "Lucene Analyzer Schema for Processing Dictionary",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"correctionThreshold": {
"type": "number",
"title": "Correct Spelling Threshold",
"description": "The count of occurrence ABOVE which the token/phrases are likely to be corrected spellings. Note that this number can be either fraction (<1.0) to denote a quantile based on count number distribution (shown in the log) or a number (>1.0) to denote the absolute count. A big number may cause performance issues.",
"default": 0.8,
"hints": [
"advanced"
]
},
"misspellingThreshold": {
"type": "number",
"title": "Misspelling Threshold",
"description": "The count of occurrence BELOW which the token/phrases are likely to be misspellings. Note that this number can be either fraction (<1.0) to denote a quantile based on count number distribution (shown in the log) or a number (>1.0) to denote the absolute count.",
"default": 0.8,
"hints": [
"advanced"
]
},
"lenScale": {
"type": "integer",
"title": "Edit Dist vs String Length Scale",
"description": "A scaling factor used to normalize the length of query string to compare against edit distances. The filtering is based on if edit_dist <= string_length/length_scale. A large value for this factor leads to a shorter correction list. A small value leads to a longer correction list but may add lower quality corrections.",
"default": 5,
"hints": [
"advanced"
]
},
"corMisRatio": {
"type": "number",
"title": "Correction and Misspelling Count Ratio",
"description": "Ratio between correction occurrence count and misspelling occurrence count. Pairs with ratio less than or equal to this number will be filtered. Big number leads to shorter correction list and may have higher quality corrections.",
"default": 3,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"tokenPhraseSpellCorrection"
],
"default": "tokenPhraseSpellCorrection",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed",
"signalDataIndicator"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"signalTypeField",
"mainType",
"filterType",
"countField"
]
},
{
"label": "Boost Parameters",
"properties": [
"lastCharMatchBoost",
"soundMatchBoost",
"correctCntBoost",
"editDistBoost"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minCountMain",
"minCountFilter",
"correctionThreshold",
"misspellingThreshold",
"lenScale",
"corMisRatio",
"maxDistance",
"minMispellingLen",
"minPrefix"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfigQuery"
]
},
{
"label": "Misc. Parameters",
"properties": [
"stopwordsBlobName",
"dictionaryCollection",
"dictionaryField",
"dictionaryDataFilterQuery",
"analyzerConfigDictionary"
]
}
]
},
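A sketch for the deprecated spell-correction job. The defaults 'query', 'count_i', 'click', and 'response' come from the schema above; the collection name is an assumption, and the output defaults to the application's query_rewrite_staging collection as the description notes.

import requests

url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>", "Content-Type": "application/json"}

job = {
    "id": "token-phrase-spell-correction",
    "type": "tokenPhraseSpellCorrection",  # fixed enum value
    "trainingCollection": "my_signals",    # hypothetical; ideally the raw signals collection
    "fieldToVectorize": "query",           # schema default input field
    "dataFormat": "solr",
    "countField": "count_i",               # schema default
    "mainType": "click",                   # schema default
    "filterType": "response"               # schema default; blank disables search-count filtering
}

print(requests.post(url, headers=headers, json=job).status_code)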
{
"type": "object",
"title": "SQL-Based Experiment Metric",
"description": "This job is created by an experiment in order to calculate an objective",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"experimentId": {
"type": "string",
"title": "Experiment ID",
"hints": [
"readonly"
]
},
"metricName": {
"type": "string",
"title": "Objective name",
"hints": [
"readonly"
]
},
"notes": {
"type": "string",
"title": "Notes",
"description": "A short description about this job.",
"hints": [
"lengthy"
]
},
"sql": {
"type": "string",
"title": "SQL",
"hints": [
"readonly"
]
},
"experiment": {
"type": "object",
"title": "Experiment",
"required": [
"id",
"baseSignalsCollection",
"metrics"
],
"properties": {
"id": {
"type": "string",
"title": "ID",
"maxLength": 128,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"description": {
"type": "string",
"title": "Description"
},
"uniqueIdParameter": {
"type": "string",
"title": "Unique ID Parameter",
"description": "The name of the request parameter containing the user ID",
"default": "userId"
},
"baseSignalsCollection": {
"type": "string",
"title": "Base Collection for Signals",
"description": "Signals resulting from requests that flow through this experiment will go into the signal collection associated with this collection",
"minLength": 1,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"variants": {
"type": "array",
"title": "Variants",
"description": "Specify what varies in this variant, and optionally change the traffic weight",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"title": "Variant id",
"hints": [
"hidden"
],
"maxLength": 128,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"name": {
"type": "string",
"title": "Name"
},
"queryPipeline": {
"type": "string",
"title": "Query Pipeline"
},
"params": {
"type": "array",
"title": "Query Params",
"description": "URL parameters to add to queries using this variant",
"items": {
"type": "object",
"required": [
"key",
"value"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
},
"policy": {
"type": "string",
"title": "Update Policy",
"enum": [
"replace",
"append",
"remove",
"default"
],
"default": "append"
}
}
}
},
"collection": {
"type": "string",
"title": "Collection"
},
"weight": {
"type": "number",
"title": "Weight",
"description": "Proportion of traffic to send to this variant. Higher values mean proportionally more traffic will be routed to this variant",
"default": 1,
"minimum": 0.01,
"exclusiveMinimum": false
}
}
}
},
"metrics": {
"type": "array",
"title": "Objectives",
"description": "Metrics that will be used to evaluate the variants",
"minItems": 1,
"items": {
"type": "object",
"required": [
"name"
],
"properties": {
"name": {
"type": "string",
"title": "Name"
},
"description": {
"type": "string",
"title": "Description"
},
"primary": {
"type": "boolean",
"title": "Primary",
"description": "Whether this metric is the primary metric used for evaluating the variants (the 'OEC')."
},
"jobId": {
"type": "string",
"title": "Associated Spark Job ID",
"hints": [
"hidden"
]
},
"binary": {
"type": "boolean",
"title": "Binary-valued metric",
"description": "Whether this metric measures a Bernoulli trial (clicks, cart adds, etc) or a continuous-valued event.",
"hints": [
"hidden"
]
}
}
}
},
"enabled": {
"type": "boolean",
"title": "Enabled",
"default": true,
"hints": [
"readonly"
]
},
"startTimestamp": {
"type": "string",
"title": "Start Date",
"description": "When the experiment last started",
"hints": [
"readonly"
],
"format": "date-time"
},
"runId": {
"type": "string",
"title": "Run Identifier",
"hints": [
"readonly",
"hidden"
]
},
"automaticallyAdjustTraffic": {
"type": "boolean",
"title": "Automatically Adjust Weights Between Variants",
"default": false
}
},
"hints": [
"hidden"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"experiment_sql"
],
"default": "experiment_sql",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
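Because experiment_sql jobs are created by experiments rather than by hand, a more useful sketch here is reading them back. The list endpoint and the response shape below are assumptions for illustration.

import requests

# Sketch: list job configurations and pick out experiment metric jobs.
url = "https://{FUSION HOST}/api/spark/configurations"  # assumed endpoint
headers = {"Authorization": "Basic <encoded-value>"}

configs = requests.get(url, headers=headers).json()  # assumed to return a JSON array
for cfg in configs:
    if cfg.get("type") == "experiment_sql":
        print(cfg.get("id"), cfg.get("experimentId"), cfg.get("metricName"))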
{
"type": "object",
"title": "Custom Spark Job",
"description": "Use this job when you want to run a custom JAR on Spark",
"required": [
"id",
"klassName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"klassName": {
"type": "string",
"title": "Class name",
"description": "Fully-qualified name of the Java/Scala class to invoke"
},
"submitArgs": {
"type": "array",
"title": "Script args",
"description": "Additional options to pass to the application when running this job.",
"items": {
"type": "string"
}
},
"script": {
"type": "string",
"title": "Scala Script",
"description": "Use this text field if you want to override the default behaviour, which is to run className.main(args)",
"hints": [
"code/scala",
"lengthy",
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"custom_spark_scala_job"
],
"default": "custom_spark_scala_job",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
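The Custom Spark Job type above requires "id", "klassName", and "type". Below is a minimal sketch of such a configuration with a hypothetical class name and arguments; the JAR containing the class is assumed to be made available to Fusion separately.

import json

# Hypothetical Custom Spark Job config; submitted the same way as the
# earlier sketch. By default the job runs klassName.main(submitArgs).
job = {
    "id": "my-custom-job",
    "klassName": "com.example.spark.MyJob",  # fully-qualified class to invoke
    "submitArgs": ["--input", "demo"],       # optional args passed to main()
    "type": "custom_spark_scala_job",
}
print(json.dumps(job, indent=2))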
{
"type": "object",
"title": "Delete Indexes in Milvus (deprecated)",
"description": "Deletes specified indexes in Milvus collections",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"collections": {
"type": "array",
"title": "Collections",
"description": "List of collections in Milvus where indexes should be dropped.",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-delete-indexes"
],
"default": "argo-milvus-delete-indexes",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
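Per the schema above, the deprecated Milvus index-deletion job requires only "id" and "type", with "collections" naming the Milvus collections to act on. A sketch with placeholder values:

import json

# Hypothetical config for the deprecated Milvus index-deletion job;
# the collection names are placeholders.
job = {
    "id": "drop-milvus-indexes",
    "collections": ["product_vectors", "query_vectors"],
    "type": "argo-milvus-delete-indexes",
}
print(json.dumps(job, indent=2))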
{
"type": "object",
"title": "Document Clustering",
"description": "Use this job when you want to cluster a set of documents and attach cluster labels based on topics.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"outputCollection",
"uidField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing documents to be clustered",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"uidField": {
"type": "string",
"title": "ID Field Name",
"description": " Field containing the unique ID for each document.",
"default": "id",
"minLength": 1
},
"clusterIdField": {
"type": "string",
"title": "Output Field Name for Cluster Id",
"description": "Output field name for unique cluster id.",
"default": "cluster_id"
},
"clusterLabelField": {
"type": "string",
"title": "Detected Cluster Keywords Field Name",
"description": "Output field name for top frequent terms that are (mostly) unique for each cluster.",
"default": "cluster_label"
},
"freqTermField": {
"type": "string",
"title": "Top Frequent Terms Field Name",
"description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
"default": "freq_terms"
},
"distToCenterField": {
"type": "string",
"title": "Output Field Name for doc distance to its cluster center",
"description": "Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).",
"default": "dist_to_center"
},
"minDF": {
"type": "number",
"title": "Min Doc Support",
"description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 5
},
"maxDF": {
"type": "number",
"title": "Max Doc Support",
"description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.5
},
"kExact": {
"type": "integer",
"title": "Number of Clusters",
"description": "Exact number of clusters.",
"default": 0
},
"kMax": {
"type": "integer",
"title": "Max Possible Number of Clusters",
"description": "Max possible number of clusters.",
"default": 20
},
"kMin": {
"type": "integer",
"title": "Min Possible Number of Clusters",
"description": "Min possible number of clusters.",
"default": 2
},
"docLenTrim": {
"type": "boolean",
"title": "Find Extreme Length Doc Flag",
"description": " Whether to separate out docs with extreme lengths.",
"default": true
},
"outlierTrim": {
"type": "boolean",
"title": "Find Outliers Flag",
"description": " Whether to perform outlier detection.",
"default": true
},
"shortLen": {
"type": "number",
"title": "Length Threshold for Short Doc",
"description": "Length threshold to define short document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 5
},
"longLen": {
"type": "number",
"title": "Length Threshold for Long Doc",
"description": "Length threshold to define long document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 0.99
},
"numKeywordsPerLabel": {
"type": "integer",
"title": "Number of Keywords for Each Cluster",
"description": "Number of Keywords needed for labeling each cluster.",
"default": 5
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"w2vDimension": {
"type": "integer",
"title": "Word2Vec Dimension",
"description": "Word-vector dimensionality to represent text (choose > 0 to use, suggested dimension ranges: 100~150)",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 8,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"patternreplace\", \"pattern\": \"^[\\\\d.]+$\", \"replacement\": \" \", \"replace\": \"all\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"code/json",
"advanced",
"lengthy"
],
"minLength": 1
},
"clusteringMethod": {
"type": "string",
"title": "Clustering Method (hierarchical or kmeans)",
"description": "Choose between hierarchical vs kmeans clustering.",
"default": "hierarchical",
"hints": [
"advanced"
]
},
"outlierK": {
"type": "integer",
"title": "Number of outlier groups",
"description": "Number of clusters to help find outliers.",
"default": 10,
"hints": [
"advanced"
]
},
"outlierThreshold": {
"type": "number",
"title": "Outlier cutoff",
"description": "Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 0.01,
"hints": [
"advanced"
]
},
"minDivisibleSize": {
"type": "number",
"title": "Minimum divisible cluster size",
"description": "Clusters must have at least this many documents to be split further. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 0,
"hints": [
"advanced"
]
},
"kDiscount": {
"type": "number",
"title": "Discount for K when choosing number of clusters",
"description": "Applies a discount to help favor large/small K (number of clusters). A smaller value pushes K to assume a higher value within the [min, max] K range.",
"default": 0.7,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"doc_clustering"
],
"default": "doc_clustering",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"uidField",
"clusterIdField",
"freqTermField",
"clusterLabelField",
"distToCenterField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"clusteringMethod",
"outlierTrim",
"outlierK",
"outlierThreshold",
"kExact",
"kMax",
"kMin",
"w2vDimension",
"w2vWindowSize",
"maxDF",
"minDF",
"norm",
"numKeywordsPerLabel",
"minDivisibleSize",
"kDiscount"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig",
"docLenTrim",
"longLen",
"shortLen"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
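A minimal Document Clustering configuration covering the required fields from the schema above ("id", "trainingCollection", "fieldToVectorize", "dataFormat", "outputCollection", "uidField", "type"); all values are hypothetical.

import json

# Hypothetical Document Clustering config. Note the weighted multi-field
# syntax for fieldToVectorize described in the schema above.
job = {
    "id": "cluster-support-docs",
    "trainingCollection": "support_docs",
    "fieldToVectorize": "title_t:2.0,body_t:1.0",
    "dataFormat": "solr",
    "outputCollection": "support_docs_clustered",
    "uidField": "id",
    "kMin": 2,   # optional: search range for the number of clusters
    "kMax": 20,
    "type": "doc_clustering",
}
print(json.dumps(job, indent=2))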
{
"type": "object",
"title": "Smart Answers Supervised Training",
"description": "Trains Smart Answers model on a supervised basis with pre-trained or trained embeddings and deploys the trained model to the ML Model Service",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"questionColName",
"answerColName",
"deployModelName",
"modelReplicas",
"modelBase",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"useAutoML": {
"type": "boolean",
"title": "Perform auto hyperparameter tuning",
"description": "Automatically tune hyperparameters (will take longer to train). Transformer models aren't used in this regime",
"default": false
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"questionColName": {
"type": "string",
"title": "Question Field",
"description": "Name of the field containing questions",
"minLength": 1
},
"answerColName": {
"type": "string",
"title": "Answer Field",
"description": "Name of the field containing answers",
"minLength": 1
},
"weightColName": {
"type": "string",
"title": "Weight Field",
"description": "Name of the field to be used for weights",
"minLength": 1
},
"deployModelName": {
"type": "string",
"title": "Model Deployment Name",
"description": "Name of the model to be used for deployment (must be a valid lowercased DNS subdomain with no underscores)",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"testMode": {
"type": "boolean",
"title": "Test Mode",
"description": "If set to true, then the training will exit after the first iteration. Useful for ensuring that the end-to-end pipeline is working",
"default": false,
"hints": [
"hidden"
]
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1
},
"modelBase": {
"type": "string",
"title": "Model base",
"description": "Specify one of these custom embeddings: ['word_custom', 'bpe_custom'] or choose one of the included pre-trained embeddings / models.",
"enum": [
"word_custom",
"bpe_custom",
"word_en_300d_2M",
"bpe_en_300d_10K",
"bpe_en_300d_200K",
"bpe_ja_300d_100K",
"bpe_ko_300d_100K",
"bpe_zh_300d_50K",
"bpe_multi_300d_320K",
"distilbert_en",
"distilbert_multi",
"biobert_v1.1"
],
"default": "word_en_300d_2M"
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "The proportion of data to be sampled from the full dataset. Use a value between 0 and 1 for a proportion (e.g. 0.5 for 50%), or for a specific number of examples, use an integer larger than 1. Leave blank for no sampling",
"hints": [
"advanced"
]
},
"seed": {
"type": "integer",
"title": "Seed",
"description": "Random seed for sampling",
"default": 12345,
"hints": [
"hidden"
]
},
"minTokensNum": {
"type": "integer",
"title": "Minimum number of words in doc",
"description": "Drop document if the total words is lower than this value",
"default": 1,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"maxTokensNum": {
"type": "integer",
"title": "Maximum number of words in doc",
"description": "Drop document if the total words is greater than this value",
"default": 5000,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"lowerCases": {
"type": "boolean",
"title": "Lower case all words",
"description": "Whether to lower case all words in training, i.e. whether to treat upper case and lower case words equally. Only utilized for custom embeddings or for the default model base: word_en_300d_2M.",
"default": false
},
"maxVocabSize": {
"type": "integer",
"title": "Maximum vocabulary size",
"description": "Maximum number of words in vocabulary, words will be trimmed if frequency is too low. Only utilized for custom embeddings or for the default model base: word_en_300d_2M.",
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"w2vEpochs": {
"type": "integer",
"title": "Word2Vec training epochs",
"description": "Number of epochs to train custom word2vec embeddings",
"default": 15,
"hints": [
"advanced"
]
},
"w2vTextsCollection": {
"type": "string",
"title": "Texts data path",
"description": "Solr collection or cloud storage path which contains extra documents that will be used to get better vocabulary coverage as well as to train custom word embeddings if custom Model Base is specified."
},
"w2vTextColumns": {
"type": "string",
"title": "Texts collection fields",
"description": "Which fields in the text collection to use. If multiple fields, please separate them by comma, e.g. description_t,title_t."
},
"textsFormat": {
"type": "string",
"title": "Texts format",
"description": "The format of the texts training data - solr, parquet etc."
},
"w2vVectorSize": {
"type": "integer",
"title": "Size of word vectors",
"description": "Word-vector dimensionality to represent text (suggested dimension ranges: 100~300)",
"default": 150,
"hints": [
"advanced"
]
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec window size",
"description": "The window size (context words from [-window, window]) for Word2Vec",
"default": 8,
"hints": [
"advanced"
]
},
"valSize": {
"type": "number",
"title": "Validation sample size",
"description": "Proportion of the unique questions that should be used as validation samples. When val_size > 1, then that specific number of unique questions will be sampled rather than a proportion.",
"default": 0.1,
"minimum": 0.001,
"exclusiveMinimum": false
},
"maxLen": {
"type": "integer",
"title": "Max length",
"description": "Maximum length of text processed by the model. Texts longer than this value will be trimmed. This parameter is especially important for Transformer-based models as it affects training and inference time. Note that the maximum supported length for Transformer models is 512, so you can specify any value up to that. The default value is the max value between three times the STD of question lengths and two times the STD of answer lengths.",
"hints": [
"advanced"
]
},
"embSPDP": {
"type": "number",
"title": "Dropout ratio",
"description": "Fraction of input to drop with Dropout layer (from 0-1)",
"default": 0.3
},
"trainBatch": {
"type": "integer",
"title": "Training batch size",
"description": "Batch size during training. If left blank, this will be set automatically based on the input data"
},
"infBatch": {
"type": "integer",
"title": "Inference batch size used in validation",
"description": "Batch size during validation. If left blank, this will be set automatically based on the input data",
"hints": [
"advanced"
]
},
"rnnNamesList": {
"type": "string",
"title": "RNN function list",
"description": "List of layers of RNNs to be used, with possible values of lstm, gru. E.g. [\"lstm\", \"lstm\"]. This value will be automatically decided based on data if left blank"
},
"rnnUnitsList": {
"type": "string",
"title": "RNN function units list",
"description": "List of RNN layer units numbers, corresponding to RNN function list. E.g. 150, 150. This value will be automatically decided based on data if left blank"
},
"epochs": {
"type": "integer",
"title": "Number of epochs to be used in training"
},
"weightDecay": {
"type": "number",
"title": "Weight decay",
"description": "L2 penalty used in AdamW optimizer. Bigger values will provide stronger regularization. Default values are 0.0003 for RNN models and 0.01 for Transformer models."
},
"monitorPatience": {
"type": "integer",
"title": "Monitor patience",
"description": "Stop training if no improvement in metrics by this number of epochs"
},
"baseLR": {
"type": "number",
"title": "Base learning rate",
"description": "Base learning rate that should be used during training. Reasonable values are from 0.0001 to 0.003 depending on model base. It's better to use lower LR with Transformer models."
},
"minLR": {
"type": "number",
"title": "Minimum learning rate",
"description": "Minimum learning rate used during training. Reasonable values are from 0.00001 to 0.00003.",
"hints": [
"advanced"
]
},
"numWarmUpEpochs": {
"type": "integer",
"title": "Number of warm-up epochs",
"description": "Number of epochs used for the warm-up stage for learning rates. Reasonable values are from 0-4 epochs, usually 1-2 are used."
},
"numFlatEpochs": {
"type": "integer",
"title": "Number of flat epochs",
"description": "Number of epochs used in flat stage for learning rates. Reasonable value would be one-half of the epochs, so the other half will be with Cosine Annealing learning rate."
},
"extraTrainingArgs": {
"type": "string",
"title": "Extra training args for Python scripts",
"description": "Add any additional arguments for the Python training scripts in this field",
"hints": [
"hidden"
]
},
"monitorMetric": {
"type": "string",
"title": "Monitor metric",
"description": "The main metric at k that should be monitored to decide when to stop training. Possible metrics are: [\"map\", \"mrr\", \"recall\", \"precision\"]",
"default": "mrr@3"
},
"monitorMetricsList": {
"type": "string",
"title": "Metrics list",
"description": "List of evaluation metrics on validation data that will be printed in the log at the end of each epoch. Possible metrics are: [\"map\", \"mrr\", \"recall\", \"precision\"]",
"default": "[\"map\", \"mrr\", \"recall\"]"
},
"kList": {
"type": "string",
"title": "Metrics@k list",
"description": "The k retrieval position that will be used to compute for each metric",
"default": "[1,3,5]"
},
"numClusters": {
"type": "integer",
"title": "Number of clusters",
"description": "DEPRECATED: please, consider using Milvus for fast dense vector similarity search. Number of clusters to be used for fast dense vector retrieval. Note no clustering will be applied if this is set to 0. If left blank, cluster count will be inferred by the job depending on the data",
"default": 0,
"hints": [
"advanced"
]
},
"topKClusters": {
"type": "integer",
"title": "Top k of clusters to return",
"description": "How many closest clusters the model can find for each query. At retrieval time, all answers in top k nearest clusters will be returned and reranked",
"default": 10,
"hints": [
"advanced"
]
},
"unidecode": {
"type": "boolean",
"title": "Apply unicode decoding",
"description": "Use Unidecode library to transform Unicode input into ASCII transliterations. Only utilized for custom embeddings or for the default model base: word_en_300d_2M",
"default": true
},
"useMixedPrecision": {
"type": "string",
"title": "Use Mixed Precision",
"description": "Check this option to train a model with mixed precision support.This will only work if the node has a GPU. You'll only see a speed up on newer NVidia GPUs (Turing and later) with Transformer models.",
"enum": [
"auto",
"true",
"false"
],
"default": "auto",
"hints": [
"advanced"
]
},
"useLabelingResolution": {
"type": "boolean",
"title": "Use Labeling Resolution",
"description": "Check this to determine similar questions and similar answers via labeling resolution and graph connected components. Does not work well with noisy data like eCommerce queries. But helps with FAQ / QnA data.",
"default": false
},
"useLayerNorm": {
"type": "boolean",
"title": "Use Layer Norm",
"description": "Check this to use layer norm for pooling.",
"default": false,
"hints": [
"advanced"
]
},
"globalPoolType": {
"type": "string",
"title": "Global Pool Type",
"description": "Determines how token vectors should be aggregated to obtain final content vector. Must be one of: [avg, max, self_attention].",
"enum": [
"avg",
"max",
"self_attention"
],
"default": "self_attention",
"hints": [
"advanced"
]
},
"embTrainable": {
"type": "boolean",
"title": "Fine-tune Token Embeddings",
"description": "Choose this to fine-tune token embeddings during model training. Tends to work well with eCommerce data.",
"default": false,
"hints": [
"advanced"
]
},
"eps": {
"type": "number",
"title": "Eps",
"description": "Epsilon is the AdamW optimizer. By default 1e-8 is used for RNN models and 1e-6 is used for Transformer models.",
"hints": [
"advanced"
]
},
"maxGradNorm": {
"type": "number",
"title": "Max Grad Norm",
"description": "Max norm used for gradients clipping. By default it’s not used for RNN models but 1.0 value is used for Transformer models.",
"hints": [
"advanced"
]
},
"useXbm": {
"type": "string",
"title": "Use Cross-batch memory",
"description": "Stores encoded representations of previous batches in memory for better negative examples sampling. Works well for Transformer models. Leave this at 'auto' to let the training module determine this.",
"enum": [
"auto",
"true",
"false"
],
"default": "auto",
"hints": [
"advanced"
]
},
"xbmMemorySize": {
"type": "integer",
"title": "Cross-batch memory size",
"description": "Number of examples from the previous batches that are stored in memory. The default size for Transformer models is 256.",
"hints": [
"advanced"
]
},
"xbmEpochActivation": {
"type": "integer",
"title": "Cross-batch epoch activation",
"description": "After which epoch cross-batch memory should be activated. By default it’s activated after the first epoch for Transformer models.",
"hints": [
"advanced"
]
},
"evalAnnIndex": {
"type": "string",
"title": "Eval ANN index",
"description": "Choose this to use Approximate Nearest Neighbor search during evaluation. For big datasets it can speed up the evaluation time with minimum loss in accuracy, for small datasets it will most likely make it slower.",
"enum": [
"auto",
"true",
"false"
],
"default": "auto",
"hints": [
"advanced"
]
},
"distance": {
"type": "string",
"title": "Distance",
"description": "Vectors distance/similarity that should be used during training and in the pipelines. Choose one of: ['cosine_similarity', 'dot_product_similarity', 'euclidean_distance'].",
"enum": [
"cosine_similarity",
"dot_product_similarity",
"euclidean_distance"
],
"default": "cosine_similarity",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-qna-supervised"
],
"default": "argo-qna-supervised",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"trainingDataFilterQuery",
"seed",
"trainingSampleFraction",
"questionColName",
"answerColName",
"weightColName",
"w2vTextsCollection",
"w2vTextColumns",
"textsFormat",
"deployModelName",
"modelReplicas",
"secretName"
]
},
{
"label": "Data Preprocessing",
"properties": [
"useLabelingResolution",
"unidecode",
"lowerCases",
"minTokensNum",
"maxTokensNum",
"maxVocabSize"
]
},
{
"label": "Custom Embeddings Initialization",
"properties": [
"w2vEpochs",
"w2vVectorSize",
"w2vWindowSize"
]
},
{
"label": "Evaluation Parameters",
"properties": [
"valSize",
"monitorMetric",
"monitorPatience",
"monitorMetricsList",
"kList",
"evalAnnIndex"
]
},
{
"label": "General Encoder Parameters",
"properties": [
"embTrainable",
"maxLen",
"globalPoolType",
"useLayerNorm",
"numClusters",
"topKClusters"
]
},
{
"label": "RNN Encoder Parameters",
"properties": [
"embSPDP",
"rnnNamesList",
"rnnUnitsList"
]
},
{
"label": "Training Parameters",
"properties": [
"epochs",
"trainBatch",
"infBatch",
"baseLR",
"numWarmUpEpochs",
"numFlatEpochs",
"minLR",
"weightDecay",
"distance",
"eps",
"maxGradNorm",
"useMixedPrecision",
"useXbm",
"xbmMemorySize",
"xbmEpochActivation"
]
}
]
},
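A sketch of a Smart Answers Supervised Training configuration with just the required fields from the schema above; collection and field names are placeholders, and "deployModelName" must be a valid lowercase DNS subdomain with no underscores.

import json

# Hypothetical Smart Answers training config using a pre-trained model base;
# submitted the same way as the first sketch.
job = {
    "id": "smart-answers-train",
    "trainingCollection": "faq_pairs",
    "trainingFormat": "solr",
    "questionColName": "question_t",
    "answerColName": "answer_t",
    "deployModelName": "qna-model",   # valid lowercase DNS subdomain
    "modelReplicas": 1,
    "modelBase": "word_en_300d_2M",   # default pre-trained embeddings
    "type": "argo-qna-supervised",
}
print(json.dumps(job, indent=2))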
{
"type": "object",
"title": "Phrase Extraction (Deprecated)",
"description": "Use this job when you want to identify statistically significant phrases in your content. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"analyzerConfig",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 8180,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store extracted phrases; defaults to the query_rewrite_staging collection for the associated app."
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"ngramSize": {
"type": "integer",
"title": "Ngram Size",
"description": "The number of words in the ngram you want to consider for the sips.",
"default": 3,
"maximum": 5,
"exclusiveMaximum": false,
"minimum": 2,
"exclusiveMinimum": false
},
"minmatch": {
"type": "integer",
"title": "Minimum Count",
"description": "The number of times a phrase must exist to be considered. NOTE: if input is non signal data, please reduce the number to e.g. 5.",
"default": 100,
"minimum": 1,
"exclusiveMinimum": false
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Text Analyzer",
"description": "The style of text analyzer you would like to use.",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
]
},
"attachPhrases": {
"type": "boolean",
"title": "Extract Key Phrases from Input Text",
"description": "Checking this will cause the job to associate extracted phrases from each source doc. and write them back to the output collection. If input data is signals, it is suggested to turn this option off. Also, currently it is not allowed to check this option while attempting to write to a _query_rewrite_staging collection.",
"default": false,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"minLikelihood": {
"type": "number",
"title": "Minimum Likelihood Score",
"description": "Phrases below this threshold will not be written in the output of this job.",
"hints": [
"advanced"
]
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"sip"
],
"default": "sip",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minmatch",
"ngramSize"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
}
]
},
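For the deprecated Phrase Extraction job, "analyzerConfig" is required but carries the default shown above, so a minimal sketch only needs to supply the input and field settings; all values below are hypothetical.

import json

# Hypothetical Phrase Extraction config. minmatch is lowered here because
# the input is assumed to be non-signal data (see the schema note above).
job = {
    "id": "extract-phrases",
    "trainingCollection": "product_descriptions",
    "fieldToVectorize": "description_t",
    "dataFormat": "solr",
    "minmatch": 5,
    "type": "sip",
}
print(json.dumps(job, indent=2))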
{
"type": "object",
"title": "Parallel Bulk Loader",
"description": "Use this job when you want to load data into Fusion from a SparkSQL compliant datasource, and send this data to any Spark supported datasource (Solr/Index Pipeline/S3/GCS/...).",
"required": [
"id",
"format",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"format": {
"type": "string",
"title": "Format",
"description": "Specifies the input data source format; common examples include: parquet, json, textinputformat"
},
"path": {
"type": "string",
"title": "Path",
"description": "Path to load; for data sources that support multiple paths, separate by commas"
},
"streaming": {
"type": "object",
"title": "Streaming",
"required": [
"enableStreaming"
],
"properties": {
"enableStreaming": {
"type": "boolean",
"title": "Enable Streaming",
"description": "Stream data from input source to output Solr collection"
},
"outputMode": {
"type": "string",
"title": "Output mode",
"description": "Specifies the output mode for streaming. E.g., append (default), complete, update",
"enum": [
"append",
"complete",
"update"
],
"default": "append"
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options passed to the data source to configure the read operation; options differ for every data source so refer to the documentation for more information.",
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to send the documents loaded from the input data source."
},
"outputIndexPipeline": {
"type": "string",
"title": "Send to Index Pipeline",
"description": "Send the documents loaded from the input data source to an index pipeline instead of going directly to Solr."
},
"outputParser": {
"type": "string",
"title": "Send to Parser",
"description": "Parser to send the documents to while sending to index pipeline. (Defaults to same as index pipeline)",
"hints": [
"advanced"
]
},
"defineFieldsUsingInputSchema": {
"type": "boolean",
"title": "Define Fields in Solr?",
"description": "If true, define fields in Solr using the input schema; if a SQL transform is defined, the fields to define are based on the transformed DataFrame schema instead of the input.",
"default": true,
"hints": [
"advanced"
]
},
"atomicUpdates": {
"type": "boolean",
"title": "Send as Atomic Updates?",
"description": "Send documents to Solr as atomic updates; only applies if sending directly to Solr and not an index pipeline.",
"default": false,
"hints": [
"advanced"
]
},
"timestampFieldName": {
"type": "string",
"title": "Timestamp Field Name",
"description": "Name of the field that holds a timestamp for each document; only required if using timestamps to filter new rows from the input source.",
"hints": [
"advanced"
]
},
"clearDatasource": {
"type": "boolean",
"title": "Clear Existing Documents",
"description": "If true, delete any documents indexed in Solr by previous runs of this job. Default is false.",
"default": false,
"hints": [
"advanced"
]
},
"outputPartitions": {
"type": "integer",
"title": "Output Partitions",
"description": "Partition the input DataFrame into partitions before writing out to Solr or Fusion",
"hints": [
"advanced"
]
},
"optimizeOutput": {
"type": "integer",
"title": "Optimize",
"description": "Optimize the Solr collection down to the specified number of segments after writing to Solr.",
"hints": [
"advanced"
]
},
"cacheAfterRead": {
"type": "boolean",
"title": "Cache After Read",
"description": "Cache input data in memory (and disk as needed) after reading; default is false, setting to true may help stability of the job by reading all data from the input source first before transforming or writing to Solr. This could make the job run slower as it adds an intermediate write operation.",
"default": false,
"hints": [
"hidden"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output. For output formats other than solr or index-pipeline, format and path options can be specified here",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"transformScala": {
"type": "string",
"title": "Transform Scala",
"description": "Optional Scala script used to transform the results returned by the data source before indexing. You must define your transform script in a method with signature: def transform(inputDF: Dataset[Row]) : Dataset[Row]",
"hints": [
"advanced",
"lengthy",
"code/scala"
]
},
"mlModelId": {
"type": "string",
"title": "Spark ML PipelineModel ID",
"description": "The ID of the Spark ML PipelineModel stored in the Fusion blob store.",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "model:ml-model"
},
"transformSql": {
"type": "string",
"title": "Transform SQL",
"description": "Optional SQL used to transform the results returned by the data source before indexing. The input DataFrame returned from the data source will be registered as a temp table named '_input'. The Scala transform is applied before the SQL transform if both are provided, which allows you to define custom UDFs in the Scala script for use in your transformation SQL.",
"hints": [
"advanced",
"lengthy",
"code/sql"
]
},
"shellOptions": {
"type": "array",
"title": "Spark Shell Options",
"description": "Additional options to pass to the Spark shell when running this job.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"templateParams": {
"type": "array",
"title": "Interpreter Params",
"description": "Bind the key/values to the script interpreter",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"continueAfterFailure": {
"type": "boolean",
"title": "Continue after index failure",
"description": "If set to true, when a failure occurs when sending a document through an index pipeline, the job will continue onto the next document instead of failing",
"default": false,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"parallel-bulk-loader"
],
"default": "parallel-bulk-loader",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
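A minimal Parallel Bulk Loader sketch that reads Parquet files and writes to a Solr collection; the path, bucket, and collection names are hypothetical.

import json

# Hypothetical Parallel Bulk Loader config; only "id", "format", and "type"
# are required by the schema above, the rest is illustrative.
job = {
    "id": "load-products-parquet",
    "format": "parquet",
    "path": "s3a://example-bucket/products/",  # comma-separate multiple paths
    "outputCollection": "products",
    "type": "parallel-bulk-loader",
}
print(json.dumps(job, indent=2))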
{
"type": "object",
"title": "Outlier Detection",
"description": "Use this job when you want to find outliers from a set of documents and attach labels for each outlier group.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"uidField",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing documents to be clustered",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"outlierGroupIdField": {
"type": "string",
"title": "Output Field Name for Outlier Group Id",
"description": "Output field name for unique outlier group id.",
"default": "outlier_group_id"
},
"outlierGroupLabelField": {
"type": "string",
"title": "Top Unique Terms Field Name",
"description": "Output field name for top frequent terms that are (mostly) unique for each outlier group as computed based on TF-IDF and group Id.",
"default": "outlier_group_label"
},
"outputOutliersOnly": {
"type": "boolean",
"title": "Only save outliers?",
"description": "If true, only outliers are saved in the output collection, otherwise, the whole dataset is saved.",
"default": false
},
"uidField": {
"type": "string",
"title": "ID Field Name",
"description": " Field containing the unique ID for each document.",
"default": "id",
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"freqTermField": {
"type": "string",
"title": "Top Frequent Terms Field Name",
"description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
"default": "freq_terms"
},
"distToCenterField": {
"type": "string",
"title": "Output Field Name for doc distance to its cluster center",
"description": "Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).",
"default": "dist_to_center"
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"minDF": {
"type": "number",
"title": "Min Doc Support",
"description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 5
},
"maxDF": {
"type": "number",
"title": "Max Doc Support",
"description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.75
},
"numKeywordsPerLabel": {
"type": "integer",
"title": "Number of Keywords for Each Cluster",
"description": "Number of Keywords needed for labeling each cluster.",
"default": 5
},
"outlierK": {
"type": "integer",
"title": "Number of outlier groups",
"description": "Number of clusters to help find outliers.",
"default": 10,
"hints": [
"advanced"
]
},
"outlierThreshold": {
"type": "number",
"title": "Outlier cutoff",
"description": "Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.01,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"outlier_detection"
],
"default": "outlier_detection",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed",
"outputOutliersOnly"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"uidField",
"outlierGroupIdField",
"outlierGroupLabelField",
"freqTermField",
"distToCenterField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"outlierK",
"outlierThreshold",
"maxDF",
"minDF",
"norm",
"numKeywordsPerLabel"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
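An Outlier Detection sketch covering the required fields from the schema above; as with the other examples, every value is a placeholder.

import json

# Hypothetical Outlier Detection config; writes only the detected outliers
# because outputOutliersOnly is set to true here.
job = {
    "id": "detect-outliers",
    "trainingCollection": "support_docs",
    "fieldToVectorize": "body_t",
    "dataFormat": "solr",
    "uidField": "id",
    "outputCollection": "support_docs_outliers",
    "outputOutliersOnly": True,
    "type": "outlier_detection",
}
print(json.dumps(job, indent=2))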
{
"type": "object",
"title": "ALS Recommender (deprecated)",
"description": "Use this job when you want to compute user recommendations or item similarities using a collaborative filtering recommender. You can also implement a user-to-item recommender in the advanced section of this job’s configuration UI. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the BPR Recommender instead.",
"required": [
"id",
"trainingCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Recommender Training Collection",
"description": "User/Item preference collection (often a signals collection or signals aggregation collection)"
},
"outputCollection": {
"type": "string",
"title": "Items-for-users Recommendation Collection",
"description": "Collection to store batch-predicted user/item recommendations (if absent, none computed)"
},
"outputItemSimCollection": {
"type": "string",
"title": "Item-to-item Similarity Collection",
"description": "Collection to store batch-computed item/item similarities (if absent, none computed)"
},
"numRecs": {
"type": "integer",
"title": "Number of User Recommendations to Compute",
"description": "Batch compute and store this many item recommendations per user",
"default": 10
},
"numSims": {
"type": "integer",
"title": "Number of Item Similarites to Compute",
"description": "Batch compute and store this many item similarities per item",
"default": 10
},
"implicitRatings": {
"type": "boolean",
"title": "Implicit Preferences",
"description": "Treat training preferences as implicit signals of interest (i.e. clicks or other actions) as opposed to explicit item ratings",
"default": true
},
"deleteOldRecs": {
"type": "boolean",
"title": "Delete Old Recommendations",
"description": "Delete old recommendations after generating new recommendations.",
"default": true
},
"excludeFromDeleteFilter": {
"type": "string",
"title": "Exclude from Delete Filter",
"description": "If the 'Delete Old Recommendations' flag is enabled, then use this query filter to identify existing recommendation docs to exclude from delete. The filter should identify recommendation docs you want to keep.",
"hints": [
"advanced"
]
},
"outputUserRecsCollection": {
"type": "string",
"title": "Users-for-items Recommendation Collection",
"description": "Collection to store batch-predicted item/user recommendations (if absent, none computed)",
"hints": [
"advanced"
]
},
"numUserRecsPerItem": {
"type": "integer",
"title": "Number of Users to Recommend to each Item",
"description": "Batch compute and store this many user recommendations per item",
"default": 10,
"hints": [
"advanced"
]
},
"modelId": {
"type": "string",
"title": "Recommender Model ID",
"description": "Identifier for the recommender model. Will be used as the unique key when storing the model in Solr. If absent, it will default to the job ID.",
"hints": [
"advanced"
]
},
"saveModel": {
"type": "boolean",
"title": "Save Model in Solr",
"description": "Whether we should save the computed ALS model in Solr",
"default": false,
"hints": [
"advanced"
]
},
"modelCollection": {
"type": "string",
"title": "Model Collection",
"description": "Collection to load and store the computed model, if \"Save Model\" is true. Defaults to \"[app name]_recommender_models\"",
"hints": [
"advanced"
],
"minLength": 1
},
"alwaysTrain": {
"type": "boolean",
"title": "Force model re-training",
"description": "Even if a model with this modelId exists, re-train if set true",
"default": true,
"hints": [
"advanced"
]
},
"maxTrainingIterations": {
"type": "integer",
"title": "Maximum Training Iterations",
"description": "Maximum number of iterations to use when learning the matrix decomposition",
"default": 10,
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr query to filter training data (e.g. downsampling or selecting based on min. pref values)",
"default": "*:*",
"hints": [
"advanced"
]
},
"popularItemMin": {
"type": "integer",
"title": "Training Data Filter By Popular Items",
"description": "Items must have at least this # of unique users interacting with it to go into the sample",
"default": 2,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Downsample preferences for items (bounded to at least 2) by this fraction",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"userIdField": {
"type": "string",
"title": "Training Collection User Id Field",
"description": "Solr field name containing stored user ids",
"default": "user_id_s",
"hints": [
"advanced"
]
},
"itemIdField": {
"type": "string",
"title": "Training Collection Item Id Field",
"description": "Solr field name containing stored item ids",
"default": "item_id_s",
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Training Collection Weight Field",
"description": "Solr field name containing stored weights or preferences the user has for that item",
"default": "weight_d",
"hints": [
"advanced"
]
},
"initialBlocks": {
"type": "integer",
"title": "Training Block Size",
"description": "Number of sub-matrix blocks to break the training data into (default: -1, for auto-sizing)",
"default": -1,
"hints": [
"hidden"
]
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Training DataFrame Config Options",
"description": "Additional Spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"initialRank": {
"type": "integer",
"title": "Recommender Rank",
"description": "Number of user/item factors in the recommender decomposition (or starting guess for it, if doing parameter grid search)",
"default": 100,
"hints": [
"advanced"
]
},
"initialAlpha": {
"type": "number",
"title": "Implicit Preference Confidence",
"description": "Confidence weight to give the implicit preferences (or starting guess, if doing parameter grid search)",
"default": 50,
"hints": [
"advanced"
]
},
"initialLambda": {
"type": "number",
"title": "Initial Lambda",
"description": "Smoothing parameter to avoid overfitting (or starting guess, if doing parameter grid search). Slightly larger value needed for small data sets",
"default": 0.01,
"hints": [
"advanced"
]
},
"gridSearchWidth": {
"type": "integer",
"title": "Grid Search Width",
"description": "Parameter grid search to be done centered around initial parameter guesses, exponential step size, this number of steps (if <= 0, no grid search). 1 is a reasonable number to start with.",
"default": 0,
"hints": [
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 13,
"hints": [
"advanced"
]
},
"itemMetadataFields": {
"type": "array",
"title": "Item Metadata Fields",
"description": "List of item metadata fields to include in the recommendation output documents.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"itemMetadataCollection": {
"type": "string",
"title": "Item Metadata Collection",
"description": "Fusion collection or catalog asset ID containing item metadata fields you want to add to the recommendation output documents.",
"hints": [
"advanced"
]
},
"itemMetadataJoinField": {
"type": "string",
"title": "Item Metadata Join Field",
"description": "Name of field in the item metadata collection to join on; defaults to the item id field configured for this job.",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format which training data comes in (like 'solr', 'hdfs', 'file', 'parquet' etc)",
"default": "solr",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"als_recommender"
],
"default": "als_recommender",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"outputUserRecsCollection",
"outputItemSimCollection",
"writeOptions"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"numSims",
"implicitRatings",
"deleteOldRecs"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"popularItemMin",
"trainingSampleFraction",
"userIdField",
"itemIdField",
"weightField",
"maxIters",
"trainingDataFrameConfigOptions",
"initialBlocks"
]
},
{
"label": "Model Settings",
"properties": [
"modelId",
"saveModel",
"modelCollection",
"alwaysTrain"
]
},
{
"label": "Grid Search Settings",
"properties": [
"initialRank",
"gridSearchWidth",
"initialAlpha",
"initialLambda",
"randomSeed"
]
},
{
"label": "Item Metadata Settings",
"properties": [
"itemMetadataCollection",
"itemMetadataJoinField",
"itemMetadataFields"
]
}
]
},
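For reference, only id, trainingCollection, and type are required for this deprecated job; every other parameter falls back to the defaults shown above. A hypothetical minimal configuration (the collection name is a placeholder):
als_job = {
    "id": "my-als-job",  # must match [a-zA-Z][_\-a-zA-Z0-9]*[a-zA-Z0-9]?
    "trainingCollection": "my_signals_aggr",  # placeholder signals aggregation collection
    "type": "als_recommender",
}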
{
"type": "object",
"title": "Upload Model Parameters To Cloud",
"description": "Upload a trained model's parameters to cloud storage",
"required": [
"id",
"modelName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Seldon Core deployment to upload (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"cloudPath": {
"type": "string",
"title": "Cloud Path",
"description": "Path to cloud storage location that will contain the saved parameters for this model - the model version will be appended to the filename at the end of the path string. Supports S3, GCS, or Azure Blob Storage URIs"
},
"cloudSecret": {
"type": "string",
"title": "Kubernetes secret name for cloud storage access",
"description": "Defines the Kubernetes secret that will be used to access cloud storage"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-upload-model"
],
"default": "argo-upload-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
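The modelName pattern above is the standard lowercase DNS-subdomain rule. A small sketch of pre-validating a name client-side before submitting the job (the sample names are invented):
import re

# Schema pattern: lowercase letters, digits, hyphens and dots; no underscores; max 30 chars
MODEL_NAME = re.compile(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$")

def is_valid_model_name(name):
    return len(name) <= 30 and MODEL_NAME.fullmatch(name) is not None

print(is_valid_model_name("qna-encoder"))  # True
print(is_valid_model_name("qna_encoder"))  # False: underscores are rejected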
{
"type": "object",
"title": "Classification",
"description": "Trains a classification model to classify text documents by assigning a label to them.",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"textField",
"labelField",
"deployModelName",
"workflowType",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"stopwordsBlobName": {
"type": "string",
"title": "Stopwords Blob Store",
"description": "Name of the stopwords blob resource. This is a .txt file with one stopword per line. By default the file is called stopwords/stopwords_en.txt however a custom file can also be used. Check documentation for more details on format and uploading to blob store.",
"default": "stopwords/stopwords_en.txt",
"reference": "blob",
"blobType": "file:spark"
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"textField": {
"type": "string",
"title": "Training collection content field",
"description": "Solr field name containing the text to be classified",
"minLength": 1
},
"labelField": {
"type": "string",
"title": "Training collection class field",
"description": "Solr field name containing the classes/labels for the text",
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`.",
"hints": [
"code/sql",
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"deployModelName": {
"type": "string",
"title": "Model Deployment Name",
"description": "Name of the model to be used for deployment (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"workflowType": {
"type": "string",
"title": "Method",
"description": "Method to be used for classification.",
"enum": [
"Logistic Regression",
"Starspace"
],
"default": "Logistic Regression"
},
"minCharLen": {
"type": "integer",
"title": "Minimum No. of Characters",
"description": "Minimum length, in characters, for the text to be included into training.",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"maxCharLen": {
"type": "integer",
"title": "Maximum No. of Characters",
"description": "Maximum length, in characters, of the training text. Texts longer than this value will be truncated.",
"default": 100000,
"minimum": 1,
"exclusiveMinimum": false
},
"lowercaseTexts": {
"type": "boolean",
"title": "Lowercase Text",
"description": "Select if you want the text to be lowercased",
"default": true
},
"unidecodeTexts": {
"type": "boolean",
"title": "Unidecode Text",
"description": "Select if you want the text to be unidecoded",
"default": true
},
"minClassSize": {
"type": "integer",
"title": "Minimum no. of examples per class",
"description": "Minimum number of samples that class should have to be included into training. Otherwise the class and all its samples are dropped.",
"default": 5,
"minimum": 2,
"exclusiveMinimum": false
},
"valSize": {
"type": "number",
"title": "Validation set size",
"description": "Size of the validation dataset. Provide a float (0, 1) if you want to sample as a fraction, or an integer >= 1 if you want to sample exact number of records.",
"default": 0.1
},
"topK": {
"type": "integer",
"title": "Number of Output classes",
"description": "Number of most probable output classes to assign to each sample along with their scores.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"featurizerType": {
"type": "string",
"title": "Featurizer",
"description": "The type of featurizer to use. TFIDF will compute both term-frequency and inverse document-frequency, whereas Count will use only term-frequency",
"enum": [
"tfidf",
"count"
],
"default": "tfidf",
"hints": [
"advanced"
]
},
"useCharacters": {
"type": "boolean",
"title": "Use Characters",
"description": "Whether to use the characters or word analyzer. Use words if the text is long. Using characters on long text can significantly increase vectorization time and memory requirements.",
"default": true
},
"tokenPattern": {
"type": "string",
"title": "Token filtering pattern",
"description": "Regex pattern for filtering tokens.",
"default": "(?u)\\b\\w\\w+\\b",
"hints": [
"hidden"
]
},
"minDf": {
"type": "number",
"title": "Min Document Frequency",
"description": "Minimum Df for token to be considered. Provide a float (0,1) if you want to specify as a fraction, otherwise integer >= 1 to specify the exact number of documents in which a token should occur.",
"default": 1,
"hints": [
"advanced"
]
},
"maxDf": {
"type": "number",
"title": "Max Document Frequency",
"description": "Maximum Df for token to be considered. Provide a float (0,1) if you want to specify as a fraction, otherwise integer >= 1 to specify the exact number of documents in which a token should occur",
"default": 0.8,
"hints": [
"advanced"
]
},
"minNgram": {
"type": "integer",
"title": "Min Ngram size",
"description": "Minimum word or character ngram size to be used.",
"minimum": 1,
"exclusiveMinimum": false
},
"maxNgram": {
"type": "integer",
"title": "Max Ngram size",
"description": "Maximum word or character ngram size to be used.",
"minimum": 1,
"exclusiveMinimum": false
},
"maxFeatures": {
"type": "integer",
"title": "Maximum Vocab Size",
"description": "Maximum number of tokens (including word or character ngrams) to consider for the vocabulary. Less frequent tokens will be omitted.",
"default": 250000,
"minimum": 1,
"exclusiveMinimum": false
},
"norm": {
"type": "string",
"title": "Use Norm",
"description": "Select the norm method to use.",
"enum": [
"None",
"L1",
"L2"
],
"default": "None",
"hints": [
"advanced"
]
},
"smoothIdf": {
"type": "boolean",
"title": "Smooth IDF",
"description": "Smooth IDF weights by adding one to document frequencies. Prevents zero divisions.",
"default": true,
"hints": [
"advanced"
]
},
"sublinearTf": {
"type": "boolean",
"title": "Sublinear TF",
"description": "Whether to apply sublinear scaling to TF, i.e. replace tf with 1 + log(tf). It usually helps when characters are used. ",
"default": true,
"hints": [
"advanced"
]
},
"scaling": {
"type": "boolean",
"title": "Scale Features",
"description": "Whether to apply Standard Scaling (X - mean(X)) / std(X) for the features. If the feature vector is sparse (no dimensionality reduction is used), then only division on standard deviation will be applied.",
"default": true
},
"dimReduction": {
"type": "boolean",
"title": "Perform Dimensionality Reduction",
"description": "Whether to perform dimensionality reduction or not. Truncated SVD is used to reduce dimensionality. Reduces overfitting and training time. Note that sparse vectors will become dense.",
"default": false
},
"dimReductionSize": {
"type": "integer",
"title": "Reduced Dimension Size",
"description": "The target dimension size of the features after dimensionality reduction.",
"default": 256,
"minimum": 1,
"exclusiveMinimum": false
},
"penalty": {
"type": "string",
"title": "Penalty",
"description": "Specify the norm used in the penalization. l2 is supported only by the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers. ‘elasticnet’ is only supported by the ‘saga’ solver. Select none, if you don't want to regularize (this is not supported by the `liblinear` solver).",
"enum": [
"l1",
"l2",
"elsaticnet",
"none"
],
"default": "l2",
"hints": [
"advanced"
]
},
"l1Ratio": {
"type": "number",
"title": "L1 penalty ratio",
"description": "Only used with the `elasticnet` penalty. If its value = 0, l2 penalty will be used. If it's value = 1, l1 penalty will be used. A value in between will use the appropirate ratio of l1 and l2 penalties.",
"default": 0.5,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"tol": {
"type": "number",
"title": "Stopping tolerance",
"description": "Tolerance for stopping criteria.",
"default": 0.0001
},
"reg": {
"type": "number",
"title": "Regularization term",
"description": "This is the inverse of regularization strength. Smaller values result in stronger regularization.",
"default": 1
},
"useClassWeights": {
"type": "boolean",
"title": "Use class weights",
"description": "If true, a weight is applied to each class inversely proportional to its frequency.",
"default": false
},
"solver": {
"type": "string",
"title": "Optimization Algorithm",
"description": "The optimization algorithm to use to fit to the data. LBFGS and SAGA are good initial choices.",
"enum": [
"lbfgs",
"newton-cg",
"liblinear",
"sag",
"saga"
],
"default": "lbfgs",
"hints": [
"advanced"
]
},
"multiClass": {
"type": "string",
"title": "Loss Method",
"description": "Whether to train a binary classifier for each class or use a multinomial loss. ‘auto’ selects ‘ovr’ if the data is binary, or if algorithm=’liblinear’, and otherwise selects ‘multinomial’.",
"enum": [
"auto",
"ovr",
"multinomial"
],
"default": "auto",
"hints": [
"advanced"
]
},
"maxIter": {
"type": "integer",
"title": "Maximum iterations for algorithm",
"description": "Maximum number of iterations taken for the optimization algorithm to converge.",
"default": 200,
"minimum": 1,
"exclusiveMinimum": false
},
"textLayersSizes": {
"type": "string",
"title": "Hidden sizes before text embedding",
"description": "Sizes of hidden layers before the embedding layer for text. Specify as a list of numbers for multiple layers or a single number for 1 layer. Leave blank if no hidden layers are required.",
"default": "[256, 128]",
"pattern": "^(\\[(((\\d)*,\\s*)*(\\d+)+)?\\])?$"
},
"labelLayersSizes": {
"type": "string",
"title": "Hidden sizes before class embedding",
"description": "Sizes of hidden layers before the embedding layer for classes. Specify as a list of numbers for multiple layers or a single number for 1 layer. Leave blank if no hidden layers are required.",
"default": "[]",
"pattern": "^(\\[(((\\d)*,\\s*)*(\\d+)+)?\\])?$"
},
"embeddingsSize": {
"type": "integer",
"title": "Embedding size",
"description": "Dimension size of final embedding vectors for text and class.",
"default": 100,
"minimum": 1,
"exclusiveMinimum": false
},
"regTerm": {
"type": "number",
"title": "Regularization Term",
"description": "Scale of L2 regularization",
"default": 0.002
},
"dropout": {
"type": "number",
"title": "Dropout",
"description": "Probability for applying dropout regularization.",
"default": 0.2
},
"embeddingReg": {
"type": "number",
"title": "Embedding regularization",
"description": "The scale of how critical the algorithm should be of minimizing the maximum similarity between embeddings of different classes",
"default": 0.8,
"hints": [
"advanced"
]
},
"minBatchSize": {
"type": "integer",
"title": "Minimum Batch Size",
"description": "The smallest batch size with which to start training. Batch size will be increased linearly every epoch, upto the maximum batch size specified.",
"default": 64,
"minimum": 1,
"exclusiveMinimum": false
},
"maxBatchSize": {
"type": "integer",
"title": "Maximum Batch Size",
"description": "The largest batch size to use during training. Batch size will be increased linearly every epoch, upto the maximum batch size specified.",
"default": 128,
"minimum": 1,
"exclusiveMinimum": false
},
"numEpochs": {
"type": "integer",
"title": "Number of training epochs",
"description": "Number of epochs for which to train the model.",
"default": 40,
"minimum": 1,
"exclusiveMinimum": false
},
"muPos": {
"type": "number",
"title": "Maximum correct class similarity",
"description": "How similar algorithm should try to make embedding vectors for correct classes. The algorithm will try to maximize similarities so that it's higher than the value specified here.",
"default": 0.8,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"muNeg": {
"type": "number",
"title": "Maximum negative class similarity",
"description": "How similar algorithm should try to make embedding vectors for negative classes. The algorithm will try to minimize similarities so that it's lower than the value specified here.",
"default": -0.4,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"similarityType": {
"type": "string",
"title": "Similarity type",
"description": "Type of similarity to use to compare the embedded vectors.",
"enum": [
"cosine",
"inner"
],
"default": "cosine",
"hints": [
"advanced"
]
},
"numNeg": {
"type": "integer",
"title": "Number of negative classes for training",
"description": "Number of negative classes to use during training to minimize their similarity to the input text. Should be less than the total number of classes.",
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"useMaxNegSim": {
"type": "boolean",
"title": "Only minimize max. negative similarity",
"description": "If true, only the maximum similarity for negative classes will be minimized. If unchecked, all negative similarities will be used.",
"default": true,
"hints": [
"advanced"
]
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-classification"
],
"default": "argo-classification",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"deployModelName",
"trainingCollection",
"trainingFormat",
"modelReplicas",
"secretName"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"randomSeed",
"textField",
"labelField"
]
},
{
"label": "Preprocessing Parameters",
"properties": [
"minCharLen",
"maxCharLen",
"minClassSize",
"lowercaseTexts",
"unidecodeTexts"
]
},
{
"label": "Eval and Output Parameters",
"properties": [
"valSize",
"topK"
]
},
{
"label": "Vectorization Parameters",
"properties": [
"featurizerType",
"useCharacters",
"stopwordsBlobName",
"minDf",
"maxDf",
"minNgram",
"maxNgram",
"maxFeatures",
"norm",
"smoothIdf",
"sublinearTf",
"scaling",
"dimReduction",
"dimReductionSize"
]
},
{
"label": "Logistic Regression Parameters",
"properties": [
"penalty",
"l1Ratio",
"tol",
"reg",
"useClassWeights",
"solver",
"multiClass",
"maxIter"
]
},
{
"label": "Starspace Parameters",
"properties": [
"textLayersSizes",
"labelLayersSizes",
"embeddingsSize",
"regTerm",
"dropout",
"embeddingReg",
"minBatchSize",
"maxBatchSize",
"numEpochs",
"muPos",
"muNeg",
"similarityType",
"numNeg",
"useMaxNegSim"
]
}
]
},
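Several of the classification parameters above (valSize, minDf, maxDf) share the convention that a float in (0, 1) is a fraction while a value >= 1 is an exact count. A sketch of that interpretation, for illustration only:
def resolve_size(value, n_records):
    # floats in (0, 1) sample a fraction; values >= 1 are exact counts
    if 0 < value < 1:
        return int(value * n_records)
    return int(value)

print(resolve_size(0.1, 50000))   # valSize default -> 5000 validation records
print(resolve_size(2000, 50000))  # explicit count -> 2000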
{
"type": "object",
"title": "Delete Ray Model Deployment",
"description": "Removes a Ray model deployment from the cluster",
"required": [
"id",
"modelName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Ray deployment to delete",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-delete-ray-model"
],
"default": "argo-delete-ray-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
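Only id, modelName, and type are required to remove a deployment. A hypothetical minimal configuration (the model name is a placeholder):
delete_ray_job = {
    "id": "delete-qna-encoder",
    "modelName": "qna-encoder",  # placeholder; lowercase DNS subdomain, max 30 chars
    "type": "argo-delete-ray-model",
}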
{
"type": "object",
"title": "Delete Collections in Milvus (deprecated)",
"description": "Deletes specified collections in Milvus",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"collections-list": {
"type": "array",
"title": "Collections",
"description": "List of collections in Milvus that should be deleted.",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-delete-collections"
],
"default": "argo-milvus-delete-collections",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
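Note the hyphenated collections-list key: it is not a valid identifier in most languages, so a configuration for this deprecated job is easiest to build as a plain dict literal (the collection names below are placeholders):
milvus_cleanup_job = {
    "id": "cleanup-milvus",
    "type": "argo-milvus-delete-collections",
    "collections-list": ["old_vectors_v1", "old_vectors_v2"],  # placeholder Milvus collections
}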
{
"type": "object",
"title": "Ground Truth",
"description": "Use this job when you want to estimate ground truth queries using click and query signals with document relevance per query determined using a click/skip formula. Pair this job with ranking metrics job to calculate relevance metrics, such as nDCG",
"required": [
"id",
"signalsCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"signalsCollection": {
"type": "string",
"title": "Signals collection",
"description": "Collection containing click signals and the associated search log identifier",
"minLength": 1
},
"searchLogsAddOpts": {
"type": "object",
"title": "Search Logs and Options",
"description": "Additional options to use while loading search logs collection",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"signalsAddOpts": {
"type": "object",
"title": "Additional Signals Options",
"description": "Additional options to use while loading signals collection",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"searchLogsPipeline": {
"type": "string",
"title": "Search Logs Pipeline",
"description": "Pipeline id associated with search log entries",
"hints": [
"advanced"
],
"minLength": 1
},
"joinKeySearchLogs": {
"type": "string",
"title": "Join Key (Query Signals)",
"description": "Join key of query signals in the signals collection",
"default": "id",
"hints": [
"advanced"
]
},
"joinKeySignals": {
"type": "string",
"title": "Join Key (Click Signals)",
"description": "Join key of click signals in the signals collection",
"default": "fusion_query_id",
"hints": [
"advanced"
]
},
"filterQueries": {
"type": "array",
"title": "Filter Queries",
"description": "Filter queries to apply while choosing top queries from query signals in signals collection",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"topQueriesLimit": {
"type": "integer",
"title": "Top Queries Limit",
"description": "Total number of queries to pick for Ground truth calculations",
"default": 100,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"ground_truth"
],
"default": "ground_truth",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"signalsCollection"
]
},
{
"label": "Additional Options",
"properties": [
"searchLogsPipeline",
"joinKeySearchLogs",
"joinKeySignals",
"searchLogsAddOpts",
"signalsAddOpts",
"filterQueries",
"topQueriesLimit"
]
}
]
},
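The two join keys above define how the job matches signals: a query signal's id is joined to each click signal's fusion_query_id. A toy sketch of that join over plain dicts (the sample records are invented):
queries = [{"id": "q1"}]                               # query signal
clicks = [{"fusion_query_id": "q1", "doc_id": "d42"}]  # click signal; doc field illustrative
joined = [(q["id"], c["doc_id"])
          for q in queries for c in clicks
          if c["fusion_query_id"] == q["id"]]
print(joined)  # [('q1', 'd42')]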
{
"type": "object",
"title": "Create Seldon Core Model Deployment",
"description": "Deploys a Seldon Core Model into the Fusion cluster",
"required": [
"id",
"deployModelName",
"modelDockerRepo",
"modelDockerImage",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"deployModelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Seldon Core deployment to deploy (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1
},
"modelDockerRepo": {
"type": "string",
"title": "Docker repository",
"description": "Defines the Docker repository where the model image is located."
},
"modelDockerImage": {
"type": "string",
"title": "Image name",
"description": "Name of the model's docker image"
},
"modelDockerSecret": {
"type": "string",
"title": "Kubernetes secret name for model repo",
"description": "Defines the Kubernetes secret to be used with the Docker repository"
},
"columnNames": {
"type": "string",
"title": "Output column names for model",
"description": "A list of column names that the model generates which the ML Service will return after inference.",
"default": "[output1, output2]"
},
"cloudPath": {
"type": "string",
"title": "Cloud Path",
"description": "Path to cloud storage location that contains the saved parameters for this model. Supports S3, GCS, or Azure Blob Storage URIs",
"hints": [
"advanced"
]
},
"cloudSecret": {
"type": "string",
"title": "Kubernetes secret name for cloud storage access",
"description": "Defines the Kubernetes secret that will be used to access cloud storage",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-deploy-model"
],
"default": "argo-deploy-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
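Required fields here are id, deployModelName, modelDockerRepo, modelDockerImage, and type. A hypothetical minimal configuration (the repository and image names are placeholders):
deploy_job = {
    "id": "deploy-my-model",
    "deployModelName": "my-model",        # lowercase DNS subdomain, no underscores
    "modelDockerRepo": "my-docker-repo",  # placeholder Docker repository
    "modelDockerImage": "my-model:1.0",   # placeholder image name
    "type": "argo-deploy-model",
}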
{
"type": "object",
"title": "Trending Recommender",
"description": "Trending Recommender",
"required": [
"id",
"trainingCollection",
"dataFormat",
"refTimeRange",
"targetTimeRange",
"countField",
"typeField",
"timeField",
"docIdField",
"types",
"recsCount",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Solr Fields to Read",
"description": "Fields to extract from Solr (not used for other formats)",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"refTimeRange": {
"type": "integer",
"title": "Reference Time Days",
"description": "Number of reference days: number of days to use as baseline to find trends (calculated from today)"
},
"targetTimeRange": {
"type": "integer",
"title": "Target Time Days",
"description": "Number of target days: number of days to use as target to find trends (calculated from today)"
},
"numWeeksRef": {
"type": "number",
"title": "Num Weeks Reference",
"description": "If using filter queries for reference and target time ranges, enter the value of (reference days / target days) here (if not using filter queries, this will be calculated automatically)",
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Field containing the number of times an event (e.g. click) occurs for a particular query; count_i in the raw signal collection or aggr_count_i in the aggregated signal collection.",
"default": "aggr_count_i",
"minLength": 1
},
"referenceTimeFilterQuery": {
"type": "string",
"title": "Reference Filter Time Query",
"description": "Add a Spark SQL filter query here for greater control of time filtering",
"hints": [
"advanced"
]
},
"targetFilterTimeQuery": {
"type": "string",
"title": "Target Filter Time Query",
"description": "Add a Spark SQL filter query here for greater control of time filtering",
"hints": [
"advanced"
]
},
"typeField": {
"type": "string",
"title": "Type field",
"description": "Enter type field (default is type)",
"default": "aggr_type_s"
},
"timeField": {
"type": "string",
"title": "Time field",
"description": "Enter time field (default is timestamp_tdt)",
"default": "timestamp_tdt"
},
"docIdField": {
"type": "string",
"title": "Document ID field",
"description": "Enter document id field (default is doc_id)",
"default": "doc_id_s"
},
"types": {
"type": "string",
"title": "Event types",
"description": "Enter a comma-separated list of event types to filter on",
"default": "click,add"
},
"recsCount": {
"type": "integer",
"title": "Recommendation Count",
"description": "Maximum number of recs to generate (or -1 for no limit)",
"default": 500
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"trending-recommender"
],
"default": "trending-recommender",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"countField"
]
}
]
},
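As the numWeeksRef description notes, when reference and target filter queries are used the value should be reference days divided by target days; otherwise the job derives it from refTimeRange and targetTimeRange automatically. For example:
ref_time_range, target_time_range = 28, 7  # illustrative day counts
num_weeks_ref = ref_time_range / target_time_range
print(num_weeks_ref)  # 4.0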
{
"type": "object",
"title": "Smart Answers Evaluate Pipeline",
"description": "Evaluates performance of a configured pipeline",
"required": [
"id",
"inputEvaluationCollection",
"trainingFormat",
"outputEvaluationCollection",
"outputFormat",
"appName",
"queryPipelineName",
"collectionName",
"returnFields",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputEvaluationCollection": {
"type": "string",
"title": "Input Evaluation Data Path",
"description": "Cloud storage path or Solr collection to pull labeled data for use in evaluation",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Input data format",
"description": "The format of the input data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"outputEvaluationCollection": {
"type": "string",
"title": "Output Evaluation Data Path",
"description": "Cloud storage path or Solr collection to store evaluation results (recommended collection is job_reports)",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"batchSize": {
"type": "string",
"title": "Output Batch Size",
"description": "If writing to solr, this field defines the batch size for documents to be pushed to solr.",
"hints": [
"advanced"
]
},
"outputFormat": {
"type": "string",
"title": "Output format",
"description": "The format of the output data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Sampling proportion",
"description": "The proportion of data to be sampled from the full dataset. Use a value between 0 and 1 for a proportion (e.g. 0.5 for 50%), or for a specific number of examples, use an integer larger than 1. Leave blank for no sampling",
"hints": [
"advanced"
]
},
"seed": {
"type": "integer",
"title": "Sampling Seed",
"description": "Random seed for sampling",
"default": 12345,
"hints": [
"advanced"
]
},
"testQuestionFieldInFile": {
"type": "string",
"title": "Test Question Field",
"description": "Defines the field in the collection containing the test question",
"default": "question"
},
"matchFieldInFile": {
"type": "string",
"title": "Ground Truth Field",
"description": "Field which contains id or text of the ground truth answer in the evaluation collection",
"default": "answer_id"
},
"matchFieldInFusion": {
"type": "string",
"title": "Answer or id Field in Fusion",
"description": "Field name in Fusion which contains answer id or text for matching ground truth answer id or text in the evaluation collection",
"default": "doc_id"
},
"appName": {
"type": "string",
"title": "App name",
"description": "Fusion app where indexed documents or QA pairs live."
},
"queryPipelineName": {
"type": "string",
"title": "Fusion Query Pipeline",
"description": "Configured query pipeline name that should be used for evaluation"
},
"collectionName": {
"type": "string",
"title": "Main Collection",
"description": "Fusion collection where indexed documents or QA pairs live"
},
"additionalParams": {
"type": "string",
"title": "Additional query parameters",
"description": "Additional query parameters to pass to return resultsfrom Fusion. Please specify in dictionary format: e.g. { \"rowsFromSolrToRerank\": 20,\"fq\": \"type:answer\" }\"",
"hints": [
"advanced"
]
},
"returnFields": {
"type": "string",
"title": "Return fields",
"description": "Fields (comma-separated) that should be returned from the main collection (e.g. question, answer). The job will add them to the output evaluation"
},
"rankingScoreField": {
"type": "string",
"title": "Ranking score",
"description": "Score to be used for ranking and evaluation",
"default": "ensemble_score",
"hints": [
"advanced"
]
},
"metricsList": {
"type": "string",
"title": "Metrics list",
"description": "List of metrics that should be computed during evaluation. e.g.[\"recall\",\"precision\",\"map\",\"mrr\"]",
"default": "[\"recall\",\"map\",\"mrr\"]",
"hints": [
"advanced"
]
},
"kList": {
"type": "string",
"title": "Metrics@k list",
"description": "The k retrieval position that will be used to compute for each metric",
"default": "[1,3,5]",
"hints": [
"advanced"
]
},
"doWeightsSelection": {
"type": "boolean",
"title": "Perform weights selection",
"description": "Whether to perform grid search to find the best weights combination for ranking scores for query pipeline's Compute Mathematical Expression stage\"",
"default": false,
"hints": [
"advanced"
]
},
"solrScaleFunc": {
"type": "string",
"title": "Solr scale function",
"description": "Function used in the pipeline to scale Solr scores. E.g., scale by max Solr score retrieved (max), scale by log with base 10 (log10) or take squre root of score (pow0.5)",
"default": "max"
},
"scoreListForWeights": {
"type": "string",
"title": "List of ranking scores for ensemble",
"description": "Ranking scores (comma-separated) used for ensemble in the query pipeline's Compute Mathematical Expression stage. The job will perform weights selection for the listed scores",
"default": "score,vectors_distance"
},
"targetRankingMetric": {
"type": "string",
"title": "Target metric to use for weight selection",
"description": "Target ranking metric to optimize during weights selection",
"default": "mrr@3"
},
"fetcherType": {
"type": "string",
"title": "Fetcher Type to use with query evaluation",
"default": "query-service",
"hints": [
"hidden"
]
},
"useLabelingResolution": {
"type": "boolean",
"title": "Use Labeling Resolution",
"description": "Check this to determine similar questions and similar answers via labeling resolution and graph connected components. Does not work well with signals data.",
"default": false,
"hints": [
"advanced"
]
},
"useConcurrentQuerying": {
"type": "boolean",
"title": "Use Concurrent Querying",
"description": "Check this option if you want to make concurrent queries to Fusion. It will greatly speed up the job at the cost of increased load on Fusion. Use with caution.",
"default": false,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-qna-evaluate"
],
"default": "argo-qna-evaluate",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input / Output Parameters",
"properties": [
"inputEvaluationCollection",
"trainingFormat",
"outputEvaluationCollection",
"outputFormat",
"trainingDataFilterQuery",
"testQuestionFieldInFile",
"matchFieldInFile",
"trainingSampleFraction",
"seed",
"useLabelingResolution",
"partitionFields",
"batchSize",
"secretName"
]
},
{
"label": "Query Pipeline Input / Output Parameters",
"properties": [
"appName",
"collectionName",
"queryPipelineName",
"matchFieldInFusion",
"additionalParams",
"returnFields",
"useConcurrentQuerying"
]
},
{
"label": "Metrics",
"properties": [
"rankingScoreField",
"metricsList",
"kList",
"doWeightsSelection",
"solrScaleFunc",
"scoreListForWeights",
"targetRankingMetric"
]
}
]
},
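Note that metricsList and kList are JSON arrays passed as strings, and that a target such as the default targetRankingMetric of mrr@3 pairs one listed metric with one listed k. A quick sketch of how the defaults combine:
import json

metrics = json.loads('["recall","map","mrr"]')  # metricsList default
k_list = json.loads('[1,3,5]')                  # kList default
targets = [f"{m}@{k}" for m in metrics for k in k_list]
print("mrr@3" in targets)  # True: the default targetRankingMetric is covered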
{
"type": "object",
"title": "Script",
"description": "Run a custom Scala script as a Fusion Job.",
"required": [
"id",
"script",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"script": {
"type": "string",
"title": "Scala Script",
"description": "Custom script written in Scala to be executed in Fusion as a Spark job.",
"hints": [
"lengthy",
"code/scala"
],
"minLength": 1
},
"shellOptions": {
"type": "array",
"title": "Spark Shell Options",
"description": "Additional options to pass to the Spark shell when running this job.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"interpreterParams": {
"type": "array",
"title": "Interpreter Params",
"description": "Bind the key/values to the Scala interpreter",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"script"
],
"default": "script",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
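For illustration, a minimal sketch of creating a Script job through the API. The /api/spark/configurations path is an assumption about the job-configurations endpoint, not something this schema defines, and the Scala one-liner is a placeholder.
import requests

job = {
    "id": "my-scala-script",  # required
    "type": "script",         # required; fixed by the enum above
    "script": 'println("hello from Fusion Spark")',  # required Scala source
}
resp = requests.post(
    "https://{FUSION HOST}/api/spark/configurations",  # assumed endpoint
    headers={"Authorization": "Basic <encoded-value>"},
    json=job,
)
print(resp.status_code, resp.text)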
{
"type": "object",
"title": "Synonym Detection (Deprecated)",
"description": "Use this job to generate synonym and similar query pairs. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"misspellingSQLDataFormat",
"phraseSQLDataFormat",
"countField",
"docIdField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection containing queries, document id and event counts. Can be either signal aggregation collection or raw signals collection.",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field Name",
"description": "Field containing queries. Change to query to use against raw signals",
"default": "query_s",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Data filter query",
"description": "Solr query to use when loading training data if using Solr, Spark SQL expression for all other data sources",
"default": "*:*",
"hints": [
"dummy"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Collection to store synonym and similar query pairs.",
"hints": [
"dummy"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"misspellingCollection": {
"type": "string",
"title": "Misspelling Job Result Collection",
"description": "Solr collection containing reviewed result of Token and phrase spell correction job. Defaults to the query_rewrite_staging collection for the app."
},
"misspellingsFilterQuery": {
"type": "string",
"title": "Misspelling Job Result Filter Query",
"description": "Solr query to additionally filter the misspelling results. Defaults to reading all approved spell corrections.",
"default": "type:spell"
},
"keyPhraseCollection": {
"type": "string",
"title": "Phrase Extraction Job Result Collection",
"description": "Solr collection containing reviewed result of Phrase extraction job. Defaults to the query_rewrite_staging collection for the app."
},
"keyPhraseFilterQuery": {
"type": "string",
"title": "Phrase Extraction Job Result Filter Query",
"description": "Solr query to additionally filter the phrase extraction results. Defaults to reading all approved phrases.",
"default": "type:phrase"
},
"misspellingSQL": {
"type": "string",
"title": "Spark SQL filter query for misspelling data",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spell_input",
"default": "SELECT surface_form AS misspelling_s, output AS correction_s FROM spell_input WHERE doc_type = 'query_rewrite' AND type = 'spell' AND review IN ('approved' OR 'auto')",
"hints": [
"code/sql",
"advanced"
]
},
"misspellingSQLDataFormat": {
"type": "string",
"title": "Misspelling Data format",
"description": "Spark-compatible format that contains spelling data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"phraseSQL": {
"type": "string",
"title": "Spark SQL filter query for phrase data",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as phrase_input",
"default": "SELECT surface_form AS phrases_s, coalesce(confidence, lit(1d)) AS likelihood_d, coalesce(word_count, lit(1d)) AS word_num_i FROM phrase_input WHERE doc_type = 'query_rewrite' AND type = 'phrase' AND review IN ('approved' OR 'auto')",
"hints": [
"code/sql",
"advanced"
]
},
"phraseSQLDataFormat": {
"type": "string",
"title": "Phrase Data format",
"description": "Spark-compatible format that contains phrase data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Solr field containing number of events (e.g., number of clicks). Change to count_i when running against raw signals",
"default": "aggr_count_i"
},
"docIdField": {
"type": "string",
"title": "Document id Field Name",
"description": "Solr field containing document id that user clicked. Change to doc_id for raw signal collection",
"default": "doc_id_s "
},
"overlapThreshold": {
"type": "number",
"title": "Query Similarity Threshold",
"description": "The threshold above which query pairs are consider similar. We can get more synonym pairs if increase this value but quality may get reduced.",
"default": 0.5,
"hints": [
"advanced"
]
},
"similarityThreshold": {
"type": "number",
"title": "Synonym Similarity Threshold",
"description": "The threshold above which synonym pairs are consider similar. We can get more synonym pairs if increase this value but quality may get reduced.",
"default": 0.01,
"hints": [
"advanced"
]
},
"minQueryCount": {
"type": "integer",
"title": "Query Clicks Threshold",
"description": "The min number of clicked documents needed for comparing queries.",
"default": 5,
"hints": [
"advanced"
]
},
"keywordsBlobName": {
"type": "string",
"title": "Keywords Blob Store",
"description": "Name of the keywords blob resource. Typically, this should be a csv file uploaded to blob store in a specific format. Check documentation for more details on format and uploading to blob store.",
"reference": "blob",
"blobType": "file:spark"
},
"synonymBlobName": {
"type": "string",
"title": "Custom Synonym Blob Store",
"description": "Name of the custom synonym blob resource. This is a Solr synonym file that will be used in the synonym detection job and will override any generated synonyms (indicated by a 'supplied' field in the Rules UI).",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "file:spark"
},
"analyzerConfigQuery": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenizing queries (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
"hints": [
"lengthy",
"advanced",
"code/json"
],
"minLength": 1
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"synonymDetection"
],
"default": "synonymDetection",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"countField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"overlapThreshold"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfigQuery"
]
},
{
"label": "Misc. Parameters",
"properties": [
"keywordsBlobName"
]
}
]
},
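Although this job is deprecated, a sketch of a config that satisfies its required fields may still be useful; the collection name is hypothetical and the remaining values are simply the schema defaults. It could be POSTed the same way as the Script job example above.
job = {
    "id": "synonym-detection",
    "type": "synonymDetection",
    "trainingCollection": "my_signals_aggr",  # hypothetical collection
    "fieldToVectorize": "query_s",
    "dataFormat": "solr",
    "misspellingSQLDataFormat": "solr",
    "phraseSQLDataFormat": "solr",
    "countField": "aggr_count_i",
    "docIdField": "doc_id_s",
}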
{
"type": "object",
"title": "Cluster Labeling",
"description": "Use this job when you already have clusters or well-defined document categories, and you want to discover and attach keywords to see representative words within those existing clusters. (If you want to create new clusters, use the Document Clustering job.)",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"clusterIdField",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing documents with defined categories or clusters",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to detect keywords from",
"description": "Field containing data from which to discover keywords for the cluster",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store output data to",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"clusterIdField": {
"type": "string",
"title": "Existing Document Category Field",
"description": "Field that contains your existing cluster IDs or document categories.",
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"clusterLabelField": {
"type": "string",
"title": "Top Unique Terms Field Name",
"description": "Output field name for top frequent terms that are (mostly) unique for each cluster.",
"default": "cluster_label"
},
"freqTermField": {
"type": "string",
"title": "Top Frequent Terms Field Name",
"description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
"default": "freq_terms"
},
"minDF": {
"type": "number",
"title": "Min Doc Support",
"description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 5
},
"maxDF": {
"type": "number",
"title": "Max Doc Support",
"description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.75
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"numKeywordsPerLabel": {
"type": "integer",
"title": "Number of Keywords for Each Cluster",
"description": "Number of Keywords needed for labeling each cluster.",
"default": 5
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"cluster_labeling"
],
"default": "cluster_labeling",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"clusterIdField",
"freqTermField",
"clusterLabelField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"maxDF",
"minDF",
"norm",
"numKeywordsPerLabel"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
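A sketch of a Cluster Labeling config covering just the required fields; all collection and field names here are hypothetical, and it would be POSTed as in the Script job example.
job = {
    "id": "label-my-clusters",
    "type": "cluster_labeling",
    "trainingCollection": "my_docs",       # hypothetical
    "fieldToVectorize": "body_t",          # hypothetical text field
    "dataFormat": "solr",
    "clusterIdField": "cluster_id_s",      # hypothetical existing-category field
    "outputCollection": "my_docs_labeled", # hypothetical
}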
{
"type": "object",
"title": "Build Training Data",
"description": "Use this job to build training data for query classification by joining signals with catalog.",
"required": [
"id",
"fieldToVectorize",
"catalogPath",
"catalogFormat",
"signalsPath",
"outputPath",
"categoryField",
"catalogIdField",
"itemIdField",
"countField",
"analyzerConfig",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"hints": [
"dummy",
"hidden"
],
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field",
"description": "Field containing query strings.",
"default": "query_s",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Signals Format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"hints": [
"dummy"
],
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Signal Data Filter Query",
"description": "Solr query to additionally filter signals. For non-solr data source use SPARK SQL FILTER QUERY under Advanced to filter results",
"default": "*:*",
"hints": [
"dummy"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to",
"hints": [
"dummy",
"hidden"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"dummy"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"dummy",
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"catalogPath": {
"type": "string",
"title": "Catalog Path",
"description": "Catalog collection or cloud storage path which contains item categories."
},
"catalogFormat": {
"type": "string",
"title": "Catalog Format",
"description": "Spark-compatible format that contains catalog data (like 'solr', 'parquet', 'orc' etc)"
},
"signalsPath": {
"type": "string",
"title": "Signals Path",
"description": "Signals collection or cloud storage path which contains item categories."
},
"outputPath": {
"type": "string",
"title": "Output Path",
"description": "Output collection or cloud storage path which contains item categories."
},
"categoryField": {
"type": "string",
"title": "Category Field in Catalog",
"description": "Item category field in catalog."
},
"catalogIdField": {
"type": "string",
"title": "Item Id Field in Catalog",
"description": "Item Id field in catalog, which will be used to join with signals"
},
"itemIdField": {
"type": "string",
"title": "Item Id Field in Signals",
"description": "Item Id field in signals, which will be used to join with catalog.",
"default": "doc_id_s"
},
"countField": {
"type": "string",
"title": "Count Field in Signals",
"description": "Count Field in raw or aggregated signals.",
"default": "aggr_count_i"
},
"topCategoryProportion": {
"type": "number",
"title": "Top Category Proportion",
"description": "Proportion of the top category has to be among all categories.",
"default": 0.5
},
"topCategoryThreshold": {
"type": "integer",
"title": "Minimum Count",
"description": "Minimum number of query,category pair counts.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Text Analyzer",
"description": "The style of text analyzer you would like to use.",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"build-training"
],
"default": "build-training",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed",
"catalogPath",
"catalogFormat",
"signalsPath",
"outputPath",
"dataOutputFormat",
"partitionCols",
"sparkSQL"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"categoryField",
"catalogIdField",
"itemIdField",
"countField"
]
},
{
"label": "Training Parameters",
"properties": [
"topCategoryProportion",
"topCategoryThreshold"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
}
]
},
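A sketch of a Build Training Data config with its required fields; the paths and field names are hypothetical, and analyzerConfig reuses the schema default shown above.
# Schema default analyzer, copied verbatim from the analyzerConfig property.
analyzer = '{ "analyzers": [{ "name": "StdTokLowerStop","charFilters": [ { "type": "htmlstrip" } ],"tokenizer": { "type": "standard" },"filters": [{ "type": "lowercase" }] }],"fields": [{ "regex": ".+", "analyzer": "StdTokLowerStop" } ]}'

job = {
    "id": "build-training-data",
    "type": "build-training",
    "fieldToVectorize": "query_s",
    "catalogPath": "my_catalog",       # hypothetical collection or path
    "catalogFormat": "solr",
    "signalsPath": "my_signals",       # hypothetical
    "outputPath": "my_training_data",  # hypothetical
    "categoryField": "category_s",     # hypothetical
    "catalogIdField": "id",
    "itemIdField": "doc_id_s",
    "countField": "aggr_count_i",
    "analyzerConfig": analyzer,
}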
{
"type": "object",
"title": "Transfer Collection To Cloud",
"description": "Transfer Collection to Cloud Storage, for collections that need to be migrated or copied to cloud storage",
"required": [
"id",
"inputCollection",
"outputLocation",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputCollection": {
"type": "string",
"title": "Collection",
"description": "Solr collection to copy",
"minLength": 1
},
"outputLocation": {
"type": "string",
"title": "Output Location",
"description": "URI of output location (e.g. s3a://..., gs://..., wasb://...)",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true
},
"outputFormat": {
"type": "string",
"title": "Output format",
"description": "Format for cloud output (e.g. parquet, json, csv)",
"default": "parquet"
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"transfer"
],
"default": "transfer",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
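This job's required set is small; a sketch with a hypothetical collection and output URI, again POSTed as in the earlier examples:
job = {
    "id": "copy-to-gcs",
    "type": "transfer",
    "inputCollection": "my_collection",         # hypothetical
    "outputLocation": "gs://my-bucket/export",  # hypothetical URI
    "outputFormat": "parquet",
}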
{
"type": "object",
"title": "BPR Recommender",
"description": "Use this job when you want to compute user recommendations or item similarities using a Bayesian Personalized Ranking recommender. You can also implement a user-to-item recommender in the advanced section of this job’s configuration UI.",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"outputFormat",
"userIdField",
"itemIdField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"outputBatchSize": {
"type": "string",
"title": "Output Batch Size",
"description": "Batch size of documents when pushing results to solr",
"default": "15000",
"hints": [
"advanced"
]
},
"jobRunName": {
"type": "string",
"title": "Job Run Name",
"description": "Identifier for this job run. Use it to filter recommendations from particular runs.",
"hints": [
"advanced"
]
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"outputUserRecsCollection": {
"type": "string",
"title": "Items-Users Output Path",
"description": "Solr collection or cloud storage path to store batch-predicted user/item recommendations (if absent, none computed). Specify at least one of Items-Users Output Collection or Items-Items Output Collection.",
"minLength": 1
},
"outputItemSimCollection": {
"type": "string",
"title": "Item-Items Output Path",
"description": "Solr collection or cloud storage path to store batch-computed item/item similarities (if absent, none computed). Specify at least one of Items-Users Output Collection or Items-Items Output Collection.",
"minLength": 1
},
"outputFormat": {
"type": "string",
"title": "Output data format",
"description": "The format of the output data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output.",
"hints": [
"advanced"
]
},
"numRecsPerUser": {
"type": "integer",
"title": "No. of recs per user",
"description": "Number of recommendations that will be saved per user.",
"default": 10,
"minimum": 0,
"exclusiveMinimum": false
},
"userTopkAnn": {
"type": "integer",
"title": "No. of User Recs to Compute for Filtering",
"description": "Applies only when Filter Already Clicked Items is enabled. This is used to fetch additional recommendations so that the value specified for the Number of Recommendations Per User is most likely satisfied with filtering turned on.",
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"numSimsPerItem": {
"type": "integer",
"title": "No. of recs per item",
"description": "Number of recommendations that will be saved per item.",
"default": 10,
"minimum": 0,
"exclusiveMinimum": false
},
"deleteOldRecs": {
"type": "boolean",
"title": "Delete Old Recommendations",
"description": "Should previous recommendations be deleted. If this box is unchecked, then old recommendations will not be deleted but new recommendations will be appended with a different Job ID. Both sets of recommendations will be contained within the same collection. Will only work when output path is solr.",
"default": true
},
"excludeFromDeleteFilter": {
"type": "string",
"title": "Exclude from Delete Filter",
"description": "If the 'Delete Old Recommendations' flag is enabled, then use this query filter to identify existing recommendation docs to exclude from delete. The filter should identify recommendation docs you want to keep.",
"hints": [
"advanced"
]
},
"filterClicked": {
"type": "boolean",
"title": "Filter already clicked items",
"description": "Whether to filter out already clicked items in item recommendations for user. Takes more time but drastically improves quality.",
"default": true,
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Training Collection Counts/Weights Field",
"description": "Solr field name containing stored counts/weights the user has for that item. This field is used as weight during training",
"default": "aggr_count_i"
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`.",
"hints": [
"code/sql",
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"userIdField": {
"type": "string",
"title": "Training Collection User Id Field",
"description": "Solr field name in the training collection that contains stored User ID.",
"default": "user_id_s",
"minLength": 1
},
"itemIdField": {
"type": "string",
"title": "Training Collection Item Id Field",
"description": "Solr field name in the training collection that contains stored Item ID.",
"default": "item_id_s",
"minLength": 1
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"itemMetadataFields": {
"type": "array",
"title": "Item Metadata Fields",
"description": "List of item metadata fields to include in the recommendation output documents. WARNING: Adding many fields can lead to huge output sizes or OOM issues.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"itemMetadataCollection": {
"type": "string",
"title": "Item Metadata Path",
"description": "Cloud storage path or Solr collection containing item metadata fields you want to add to the recommendation output documents. Leave blank and fill in the metadata fields if you want to fetch data from the training collection. Join field needs to be specified.",
"hints": [
"advanced"
]
},
"itemMetadataFormat": {
"type": "string",
"title": "Metdata format",
"description": "The format of the metadata - solr, parquet etc.",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"itemMetadataJoinField": {
"type": "string",
"title": "Item Metadata Join Field",
"description": "Name of field in the item metadata collection to join on.",
"hints": [
"advanced"
]
},
"performANN": {
"type": "boolean",
"title": "Perform approximate nearest neighbor search",
"description": "Whether to perform approximate nearest neighbor search (ANN). ANN will drastically reduce training time, but accuracy will drop a little. Disable only if training dataset is very small.",
"default": true
},
"maxNeighbors": {
"type": "integer",
"title": "Max neighbors for indexing",
"description": "If perform ANN, size of the potential neighbors for the indexing phase. Higher value leads to better recall and shorter retrieval times (at the expense of longer indexing time).Reasonable range: 5~100",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"searchNN": {
"type": "integer",
"title": "Search Depth",
"description": "If perform ANN, the depth of search used to find neighbors. Higher value improves recall at the expense of longer retrieval time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"indexNN": {
"type": "integer",
"title": "Indexing Depth",
"description": "If perform ANN, the depth of constructed index. Higher value improves recall at the expense of longer indexing time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"factors": {
"type": "integer",
"title": "Dimension of latent factors",
"description": "Latent factor dimension used for matrix decomposition. Bigger values require more time and memory but usually provide better results.",
"default": 100,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"epochs": {
"type": "integer",
"title": "Training iterations",
"description": "Number of model training iterations. Model will converge better with larger number at the expense of increased training time. For bigger datasets use smaller values.",
"default": 30,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"learningRate": {
"type": "number",
"title": "Learning rate",
"description": "Model learning rate.",
"default": 0.05,
"hints": [
"advanced"
]
},
"metadataCategoryFields": {
"type": "array",
"title": "Metadata fields for item-item evaluation",
"description": "These fields will be used for item-item evaluation and for determining if the recommendation pair belong to the same category.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"minNumItemUniqueClicks": {
"type": "integer",
"title": "Training Data Filtered By Popular Items",
"description": "Items must have at least this no. of unique user interactions to be included for training and recommendations. The higher this value, the more popular items selected but the amount of training data will reduce.",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"minNumUserUniqueClicks": {
"type": "integer",
"title": "Training Data Filtered By User clicks",
"description": "Users must have at least this no. of unique item interactions to be included for training and recommendations. The higher this value, the more active users are selected but the amount of training data will reduce.",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"minNumClickedProducts": {
"type": "integer",
"title": "Minimum Clicked Products",
"description": "Minimum number of clicked products the user should have to be a candidate for the test set.",
"default": 3,
"minimum": 2,
"exclusiveMinimum": false
},
"maxNumTestUsers": {
"type": "integer",
"title": "Maximum Test Users",
"description": "Maximum number of test users to choose. If more users satisfying the Minimum Clicked Products criterion are present, the number will be capped to what is specified here.",
"default": 10000,
"minimum": 0,
"exclusiveMinimum": false
},
"numTestUserClicks": {
"type": "integer",
"title": "Number of User Clicks for Test",
"description": "How many test user clicks to use for testing. Should be less than the value for Minimum Clicked Products.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"doEvaluation": {
"type": "boolean",
"title": "Evaluate on test data",
"description": "Evaluate how well the trained model predicts user clicks. Test data will be sampled from original dataset."
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-item-recommender-user"
],
"default": "argo-item-recommender-user",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"outputUserRecsCollection",
"outputItemSimCollection",
"outputFormat",
"outputBatchSize",
"secretName",
"partitionFields"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"userIdField",
"itemIdField",
"weightField",
"trainingDataFrameConfigOptions"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"numRecsPerUser",
"numSimsPerItem",
"filterClicked",
"userTopkAnn",
"minNumItemUniqueClicks",
"minNumUserUniqueClicks",
"maxIters",
"deleteOldRecs",
"excludeFromDeleteFilter",
"performANN",
"maxNeighbors",
"searchNN",
"indexNN",
"factors",
"epochs",
"learningRate",
"randomSeed"
]
},
{
"label": "Evaluation Parameters",
"properties": [
"doEvaluation",
"minNumClickedProducts",
"numTestUserClicks",
"maxNumTestUsers"
]
},
{
"label": "Item Metadata Settings",
"properties": [
"itemMetadataCollection",
"itemMetadataFormat",
"itemMetadataJoinField",
"itemMetadataFields",
"metadataCategoryFields"
]
}
]
},
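A sketch of a BPR Recommender config plus an assumed run call. Per the output-path descriptions above, at least one of outputUserRecsCollection or outputItemSimCollection should also be set. The /api/spark/configurations and /api/spark/jobs paths are assumptions, as are all collection and field values.
import requests

headers = {"Authorization": "Basic <encoded-value>"}
job = {
    "id": "bpr-recs",
    "type": "argo-item-recommender-user",
    "trainingCollection": "my_signals",       # hypothetical
    "trainingFormat": "solr",
    "outputFormat": "solr",
    "userIdField": "user_id_s",
    "itemIdField": "item_id_s",
    "outputUserRecsCollection": "user_recs",  # hypothetical
}
requests.post("https://{FUSION HOST}/api/spark/configurations", headers=headers, json=job)
# Assumed job-runner endpoint: start the job just defined.
requests.post("https://{FUSION HOST}/api/spark/jobs/bpr-recs", headers=headers)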
{
"type": "object",
"title": "Query-to-Query Collaborative Similarity (deprecated)",
"description": "Use this job to to batch compute query-query similarities using ALS. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the Query-to-Query Session Based Similarity job instead.",
"required": [
"id",
"trainingCollection",
"outputQuerySimCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Recommender Model ID",
"description": "Identifier for the recommender model. Will be used as the unique key when storing the model in Solr.",
"hints": [
"advanced"
]
},
"modelCollection": {
"type": "string",
"title": "Model Collection",
"description": "Collection to load and store the computed model (if absent, it won't be loaded or saved)",
"hints": [
"advanced"
]
},
"saveModel": {
"type": "boolean",
"title": "Save Model in Solr",
"description": "Whether we should save the computed ALS model in Solr",
"default": false,
"hints": [
"advanced"
]
},
"trainingCollection": {
"type": "string",
"title": "Recommender Training Collection",
"description": "Item/Query preference collection (often a signals collection or signals aggregation collection)"
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr query to filter training data (e.g. downsampling or selecting based on min. pref values)",
"default": "*:*",
"hints": [
"advanced"
]
},
"popularQueryMin": {
"type": "integer",
"title": "Training Data Filter By Popular Items",
"description": "Items must have at least this # of unique users interacting with it to go into the sample",
"default": 2,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Downsample preferences for items (bounded to at least 2) by this fraction",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"outputQuerySimCollection": {
"type": "string",
"title": "Query-to-query Similarity Collection",
"description": "Collection to store batch-computed query/query similarities (if absent, none computed)"
},
"outputItemsForQueriesCollection": {
"type": "string",
"title": "Items-for-query Boosting Collection",
"description": "Collection to store batch-computed items-for-queries recommendations (if absent, none computed)"
},
"queryField": {
"type": "string",
"title": "Training Collection Query Field",
"description": "Solr field name containing stored queries",
"default": "query",
"hints": [
"advanced"
]
},
"itemIdField": {
"type": "string",
"title": "Training Collection Item Id Field",
"description": "Solr field name containing stored item ids",
"default": "item_id_s",
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Training Collection Weight Field",
"description": "Solr field name containing stored weights (i.e. time decayed / position weighted counts) the item has for that query",
"default": "weight_d",
"hints": [
"advanced"
]
},
"numSims": {
"type": "integer",
"title": "Number of Query Similarities to Compute",
"description": "Batch compute and store this many query similarities per query",
"default": 10,
"hints": [
"advanced"
]
},
"numItemsPerQuery": {
"type": "integer",
"title": "Number of Items per Query to Recommend",
"description": "Batch compute and store this many item recommendations per query",
"default": 10,
"hints": [
"advanced"
]
},
"initialRank": {
"type": "integer",
"title": "Recommender Rank",
"description": "Number of user/item factors in the recommender decomposition (or starting guess for it, if doing parameter grid search)",
"default": 100,
"hints": [
"advanced"
]
},
"initialBlocks": {
"type": "integer",
"title": "Training Block Size",
"description": "Number of sub-matrix blocks to break the training data into (default: -1, for auto-sizing)",
"default": -1,
"hints": [
"hidden"
]
},
"maxTrainingIterations": {
"type": "integer",
"title": "Maximum Training Iterations",
"description": "Maximum number of iterations to use when learning the matrix decomposition",
"default": 10,
"hints": [
"advanced"
]
},
"initialAlpha": {
"type": "number",
"title": "Implicit Preference Confidence",
"description": "Confidence weight (between 0 and 1) to give the implicit preferences (or starting guess, if doing parameter grid search)",
"default": 0.5,
"hints": [
"advanced"
]
},
"initialLambda": {
"type": "number",
"title": "Smoothing",
"description": "Smoothing parameter to avoid overfitting (or starting guess, if doing parameter grid search). Slightly larger value needed for small data sets",
"default": 0.01,
"hints": [
"advanced"
]
},
"gridSearchWidth": {
"type": "integer",
"title": "Grid Search Width",
"description": "Parameter grid search to be done centered around initial parameter guesses, exponential step size, this number of steps (if <= 0, no grid search)",
"default": 1,
"hints": [
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 13,
"hints": [
"advanced"
]
},
"implicitRatings": {
"type": "boolean",
"title": "Implicit Preferences",
"description": "Treat training preferences as implicit signals of interest (i.e. clicks or other actions) as opposed to explicit query ratings",
"default": true
},
"alwaysTrain": {
"type": "boolean",
"title": "Force model re-training",
"description": "Even if a model with this modelId exists, re-train if set true",
"default": true
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"query_similarity"
],
"default": "query_similarity",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingDataFilterQuery",
"modelCollection",
"outputItemsForQueriesCollection",
"outputQuerySimCollection",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingSampleFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"queryField",
"itemIdField",
"weightField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"alwaysTrain",
"saveModel",
"gridSearchWidth",
"implicitRatings",
"initialAlpha",
"initialLambda",
"initialRank",
"maxTrainingIterations",
"numItemsPerQuery",
"numSims",
"popularQueryMin"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
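Deprecated as noted, but its required set is small; a sketch with hypothetical names, to be POSTed as in the earlier examples:
job = {
    "id": "q2q-collab-sim",
    "type": "query_similarity",
    "trainingCollection": "my_signals_aggr",   # hypothetical
    "outputQuerySimCollection": "query_sims",  # hypothetical
}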
{
"type": "object",
"title": "Content based Recommender",
"description": "Use this job when you want to compute item similarities based on their content such as product descriptions. ",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"outputCollection",
"outputFormat",
"itemIdField",
"contentField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"outputBatchSize": {
"type": "string",
"title": "Output Batch Size",
"description": "Batch size of documents when pushing results to solr",
"default": "15000",
"hints": [
"advanced"
]
},
"unidecodeText": {
"type": "boolean",
"title": "Unidecode Text",
"description": "Select if you want the text to be unidecoded.",
"default": true
},
"lowercaseText": {
"type": "boolean",
"title": "Lowercase Text",
"description": "Select if you want the text to be lowercased.",
"default": true
},
"vectorizationUseDl": {
"type": "boolean",
"title": "Use Deep Learning for vectorization",
"description": "Select if you want to use deep learning as the method for vectorization. You can choose the other methods too in which case an ensemble will be used.",
"default": true
},
"vectorizationUseFasttext": {
"type": "boolean",
"title": "Use Word2Vec for vectorization",
"description": "Select if you want to use word2vec as the method for vectorization. You can choose the other methods too in which case an ensemble will be used. Custom embeddings will be learned. Useful for jargon."
},
"vectorizationUseTfidf": {
"type": "boolean",
"title": "Use Tf-Idf for vectorization",
"description": "Select if you want to use Tf-idf as the method for vectorization. You can choose the other methods too in which case an ensemble will be used."
},
"vectorizationDlEnsembleWeight": {
"type": "number",
"title": "Deep learning vectorization ensemble weight",
"description": "Ensemble weight for deep learning based vectorization if more than one method of vectorization is selected.",
"default": 1
},
"vectorizationFasttextVectorsSize": {
"type": "integer",
"title": "Size of word vectors",
"description": "Word vector dimensions for Word2Vec vectorizer.",
"default": 150,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextWindowSize": {
"type": "integer",
"title": "Word2Vec window size",
"description": "The window size (context words from [-window, window]) for Word2Vec.",
"default": 5,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextEpochs": {
"type": "integer",
"title": "Word2Vec training epochs",
"description": "Number of epochs to train custom Word2Vec embeddings.",
"default": 15,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextSkipGram": {
"type": "boolean",
"title": "Use SkipGram model",
"description": "Whether to use skip gram for training. If unchecked, CBOW will be used.",
"default": true,
"hints": [
"hidden"
]
},
"vectorizationFasttextMinCount": {
"type": "integer",
"title": "Min count of words",
"description": "Minimum times a token needs to occur in the text to be considered for the vocab.",
"default": 1,
"hints": [
"hidden"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextMaxVocabSize": {
"type": "integer",
"title": "Max vocab size",
"description": "Maximum number of tokens to consider for the vocab. Less frequent tokens will be omitted.",
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextUseSubwordNgram": {
"type": "boolean",
"title": "Use subword ngrams",
"description": "Whether to use subword (character) ngrams.",
"default": true,
"hints": [
"hidden"
]
},
"vectorizationFasttextMinNgram": {
"type": "integer",
"title": "Min Ngram size",
"description": "Minimum size for ngrams generated.",
"default": 3,
"hints": [
"hidden"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextMaxNgram": {
"type": "integer",
"title": "Max Ngram size",
"description": "Maximum size for ngrams generated.",
"default": 6,
"hints": [
"hidden"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextEnsembleWeight": {
"type": "number",
"title": "Word2Vec vectorization ensemble weight",
"description": "Ensemble weight for Fasttext based vectorization if more than one method of vectorization is selected.",
"default": 1
},
"vectorizationTfidfUseCharacters": {
"type": "boolean",
"title": "Use characters ngrams",
"description": "Whether to use characters. By default words are used."
},
"vectorizationTfidfFilterStopwords": {
"type": "boolean",
"title": "Filter stopwords",
"description": "Whether to filter out stopwords before generating Tf-Idf weights.",
"default": true
},
"vectorizationTfidfMinDf": {
"type": "number",
"title": "Min Document Frequency",
"description": "Minimum Df for token to be considered.",
"hints": [
"hidden"
]
},
"vectorizationTfidfMaxDf": {
"type": "number",
"title": "Max Document Frequency",
"description": "Maximum Df for token to be considered.",
"default": 1,
"hints": [
"hidden"
]
},
"vectorizationTfidfMinNgram": {
"type": "integer",
"title": "Min Ngram size",
"description": "Minimum Ngram size to be used.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationTfidfMaxNgram": {
"type": "integer",
"title": "Max Ngram size",
"description": "Maximum Ngram size to be used.",
"default": 3,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationTfIdfMaxVocabSize": {
"type": "integer",
"title": "Max vocab size",
"description": "Maximum number of tokens to consider for the vocab. Less frequent tokens will be omitted.",
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationTfidfEnsembleWeight": {
"type": "number",
"title": "Tf-Idf vectorization ensemble weight",
"description": "Ensemble weight for Tf-Idf based vectorization if more than one method of vectorization is selected.",
"default": 1
},
"topKAnn": {
"type": "integer",
"title": "No. of Item Recs to compute for ensemble",
"description": "This is used to fetch additional recommendations so that the value specified for the Number of User Recommendations to Compute is most likely satisfied after filtering. This is normally set to 10 * (No. of item recommendations to compute)",
"default": 100,
"minimum": 1,
"exclusiveMinimum": false
},
"jobRunName": {
"type": "string",
"title": "Job Run Name",
"description": "Identifier for this job run. Use it to filter recommendations from particular runs",
"hints": [
"advanced"
]
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"outputCollection": {
"type": "string",
"title": "Output data path",
"description": "Solr collection or cloud storage path where output data is to be written."
},
"outputFormat": {
"type": "string",
"title": "Output data format",
"description": "The format of the output data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"numSimsPerItem": {
"type": "integer",
"title": "No. of Item Recs to Compute",
"description": "Number of recommendations that will be saved per item.",
"default": 10,
"minimum": 1,
"exclusiveMinimum": false
},
"deleteOldRecs": {
"type": "boolean",
"title": "Delete Old Recommendations",
"description": "Should previous recommendations be deleted. If this box is unchecked, then old recommendations will not be deleted but new recommendations will be appended with a different Job ID. Both sets of recommendations will be contained within the same collection. Will only work when output path is solr.",
"default": true
},
"excludeFromDeleteFilter": {
"type": "string",
"title": "Exclude from Delete Filter",
"description": "If the 'Delete Old Recommendations' flag is enabled, then use this query filter to identify existing recommendation docs to exclude from delete. The filter should identify recommendation docs you want to keep.",
"hints": [
"advanced"
]
},
"metadataCategoryFields": {
"type": "array",
"title": "Metadata fields for item-item evaluation",
"description": "These fields will be used for item-item evaluation and for determining if the recommendation pair belongs to the same category.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`.",
"hints": [
"code/sql",
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"itemIdField": {
"type": "string",
"title": "Training Item Id Field",
"description": "Field name containing stored item ids",
"default": "item_id_s",
"minLength": 1
},
"contentField": {
"type": "array",
"title": "Training Content Field",
"description": "Field name containing item content such as product description",
"items": {
"type": "string"
}
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"itemMetadataFields": {
"type": "array",
"title": "Item Metadata Fields",
"description": "List of item metadata fields to include in the recommendation output documents.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"vectorizationDlBatchSize": {
"type": "integer",
"title": "Batch size to compute encodings",
"description": "Compute encodings in batches in case hardware out of memory.",
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"performANN": {
"type": "boolean",
"title": "Perform approximate nearest neighbor search",
"description": "Whether to perform approximate nearest neighbor search (ANN). ANN will drastically reduce training time, but accuracy will drop a little. Disable only if dataset is very small.",
"default": true
},
"maxNeighbors": {
"type": "integer",
"title": "Max neighbors for indexing",
"description": "If perform ANN, size of the potential neighbors for the indexing phase. Higher value leads to better recall and shorter retrieval times (at the expense of longer indexing time).Reasonable range: 5~100",
"hints": [
"advanced"
],
"maximum": 100,
"exclusiveMaximum": false,
"minimum": 5,
"exclusiveMinimum": false
},
"searchNN": {
"type": "integer",
"title": "Search Depth",
"description": "If perform ANN, the depth of search used to find neighbors. Higher value improves recall at the expense of longer retrieval time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"indexNN": {
"type": "integer",
"title": "Indexing Depth",
"description": "If perform ANN, the depth of constructed index. Higher value improves recall at the expense of longer indexing time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-item-recommender-content"
],
"default": "argo-item-recommender-content",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"outputCollection",
"outputFormat",
"outputBatchSize",
"secretName",
"partitionFields"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"randomSeed",
"itemIdField",
"contentField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"numSimsPerItem",
"topKAnn",
"performANN",
"maxNeighbors",
"searchNN",
"indexNN",
"unidecodeText",
"lowercaseText",
"deleteOldRecs",
"excludeFromDeleteFilter"
]
},
{
"label": "Vectorization Parameters",
"properties": [
"vectorizationUseDl",
"vectorizationUseFasttext",
"vectorizationUseTfidf"
]
},
{
"label": "Deep Learning Vectorization Parameters",
"properties": [
"vectorizationDlBatchSize",
"vectorizationDlEnsembleWeight"
]
},
{
"label": "Word2Vec Vectorization Parameters",
"properties": [
"vectorizationFasttextVectorsSize",
"vectorizationFasttextWindowSize",
"vectorizationFasttextEpochs",
"vectorizationFasttextMinNgram",
"vectorizationFasttextEnsembleWeight",
"vectorizationFasttextMaxVocabSize"
]
},
{
"label": "Tf-Idf Vectorization Parameters",
"properties": [
"vectorizationTfidfUseCharacters",
"vectorizationTfidfFilterStopwords",
"vectorizationTfidfMinNgram",
"vectorizationTfidfMaxNgram",
"vectorizationTfIdfMaxVocabSize",
"vectorizationTfidfEnsembleWeight"
]
},
{
"label": "Item Metadata Settings",
"properties": [
"itemMetadataFields",
"metadataCategoryFields"
]
}
]
},
{
"type": "object",
"title": "Delete Seldon Core Model Deployment",
"description": "Removes a Seldon Core deployment from the cluster",
"required": [
"id",
"modelName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Seldon Core deployment to delete",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-delete-model"
],
"default": "argo-delete-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
{
"type": "object",
"title": "Logistic Regression Classifier Training (deprecated)",
"description": "Use this job when you have training data and you want to train a logistic regression model to classify text into groups. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the Classification job instead.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"trainingLabelField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"advanced",
"code/json",
"lengthy"
]
},
"withIdf": {
"type": "boolean",
"title": "IDF Weighting",
"description": "Weight vector components based on inverse document frequency",
"default": true,
"hints": [
"advanced"
]
},
"w2vDimension": {
"type": "integer",
"title": "Word2Vec Dimension",
"description": "Word-vector dimensionality to represent text (choose > 0 to use)",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 5,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxSentenceLength": {
"type": "integer",
"title": "Max Word2Vec Sentence Length",
"description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
"default": 1000,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxIter": {
"type": "integer",
"title": "Max Word2Vec Iterations",
"description": "Maximum number of iterations of the word2vec training",
"default": 1,
"hints": [
"advanced"
]
},
"w2vStepSize": {
"type": "number",
"title": "Word2Vec Step Size",
"description": "Training parameter for word2vec convergence (change at your own peril)",
"default": 0.025,
"hints": [
"advanced"
],
"minimum": 0.005,
"exclusiveMinimum": false
},
"minDF": {
"type": "number",
"title": "Minimum Term Document Frequency",
"description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
"default": 0,
"hints": [
"advanced"
]
},
"maxDF": {
"type": "number",
"title": "Max Term Document Frequency",
"description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
"default": 1,
"hints": [
"advanced"
]
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"predictedLabelField": {
"type": "string",
"title": "Predicted Label Field",
"description": "Solr field which will contain labels when classifier is applied to documents",
"default": "labelPredictedByFusionModel",
"hints": [
"advanced"
]
},
"serializeAsMleap": {
"type": "boolean",
"title": "Serialize as Mleap Bundle",
"description": "Serialize the output model as Mleap Bundle",
"default": true,
"hints": [
"hidden"
]
},
"minSparkPartitions": {
"type": "integer",
"title": "Minimum Number of Spark Partitions",
"description": "Minimum number of Spark partitions for training job.",
"default": 200,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"overwriteExistingModel": {
"type": "boolean",
"title": "Overwrite existing model",
"description": "If a model exists in the model store, overwrite when this job runs",
"default": true,
"hints": [
"advanced"
]
},
"trainingLabelField": {
"type": "string",
"title": "Label Field",
"description": "Solr field containing labels for training instances (should be single-valued strings)"
},
"gridSearch": {
"type": "boolean",
"title": "Grid Search with Cross Validation",
"description": "Perform grid search to optimize hyperparameters",
"default": false
},
"evaluationMetricType": {
"type": "string",
"title": "Evaluation Metric Type",
"description": "Optimize hyperparameter search over one of [binary, multiclass, regression] metrics, or 'none'",
"enum": [
"binary",
"multiclass",
"regression",
"none"
],
"default": "none",
"hints": [
"advanced"
]
},
"autoBalanceClasses": {
"type": "boolean",
"title": "Auto-balance training classes",
"description": "Ensure that all classes of training data have the same size",
"default": true,
"hints": [
"advanced"
]
},
"minTrainingSamplesPerClass": {
"type": "integer",
"title": "Minimum Labeled Class Size",
"description": "Ensure that all classes of training data have at least this many examples",
"default": 100,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"makeOtherClass": {
"type": "boolean",
"title": "Make 'Other' Class",
"description": "Create a label class 'Other' which contains all examples not in a class large enough to train on",
"default": true,
"hints": [
"advanced"
]
},
"otherClassName": {
"type": "string",
"title": "'Other' class name",
"description": "Label class name for the catch-all 'Other' class",
"default": "Other",
"hints": [
"advanced"
],
"minLength": 1
},
"regularizationWeight": {
"type": "number",
"title": "Regularization weight",
"description": "Degree of regularization to use when training (L2 lambda parameter if elasticNetWeight = 0)",
"default": 0.01,
"maximum": 1,
"exclusiveMaximum": false,
"minimum": 0.000001,
"exclusiveMinimum": false
},
"elasticNetWeight": {
"type": "number",
"title": "Elastic net weight",
"description": "Value between 0 and 1 to interpolate between ridge (0.0) and lasso (1.0) regression",
"default": 0,
"maximum": 1,
"exclusiveMaximum": false
},
"maxIters": {
"type": "integer",
"title": "Maximum number of iterations",
"description": "Maximum number of iterations to perform before halting, even if the convergence criterion has not been met.",
"default": 10
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"logistic_regression_classifier_trainer"
],
"default": "logistic_regression_classifier_trainer",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"predictedLabelField",
"trainingLabelField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"w2vDimension",
"w2vWindowSize",
"w2vMaxIter",
"w2vMaxSentenceLength",
"w2vStepSize",
"withIdf",
"maxDF",
"minDF",
"norm",
"autoBalanceClasses",
"evaluationMetricType",
"minTrainingSamplesPerClass",
"otherClassName",
"makeOtherClass",
"gridSearch",
"elasticNetWeight",
"maxIters",
"regularizationWeight"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
{
"type": "object",
"title": "Create Ray Model Deployment",
"description": "Deploys a Ray Model into the Fusion cluster",
"required": [
"id",
"deployModelName",
"modelCpuLimit",
"modelMemoryLimit",
"modelDockerRepo",
"modelDockerImage",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"deployModelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Ray deployment to deploy (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"modelMinReplicas": {
"type": "integer",
"title": "Model min replicas",
"description": "Minimum number of replicas of the model to be deployed",
"default": 1
},
"modelMaxReplicas": {
"type": "integer",
"title": "Model max replicas",
"description": "Maximum number of replicas of the model to be deployed",
"default": 1
},
"modelCpuLimit": {
"type": "number",
"title": "Model CPU limit",
"description": "Maximum number of CPUs that can be allocated to a single model replica",
"default": 1
},
"modelMemoryLimit": {
"type": "string",
"title": "Model memory limit",
"description": "Maximum amount of memory that can be allocated to a single model replica",
"default": "1Gi",
"pattern": "^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$"
},
"modelImportPath": {
"type": "string",
"title": "Ray deployment import path",
"description": "The path to your top-level Ray Serve deployment (or the same path passed to `serve run`)",
"default": "deployment:app"
},
"modelDockerRepo": {
"type": "string",
"title": "Docker repository",
"description": "Defines the Docker repository where the model image is located."
},
"modelDockerImage": {
"type": "string",
"title": "Image name",
"description": "Name of the model's docker image"
},
"modelDockerSecret": {
"type": "string",
"title": "Kubernetes secret name for model repo",
"description": "Defines the Kubernetes secret to be used with the Docker repository"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-deploy-ray-model"
],
"default": "argo-deploy-ray-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
}
]
}
Get the configuration schemas for all job types.
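Since the response is a standard JSON Schema document whose top-level oneOf array holds one entry per job type, it can be parsed rather than just printed. The following is a minimal sketch (not part of the official examples) that lists each job type with its pinned type enum value and its required fields; it assumes only the response shape shown in this document, and reuses the {FUSION HOST} and <encoded-value> placeholders from the request example below.

import requests

# Same endpoint and auth placeholders as the request example below.
url = "https://{FUSION HOST}/api/spark/schema"
headers = {"Authorization": "Basic <encoded-value>"}

response = requests.get(url, headers=headers)
response.raise_for_status()  # surface auth or connectivity errors early
schema = response.json()

# Each entry in "oneOf" is the configuration schema for one job type.
for job_schema in schema.get("oneOf", []):
    title = job_schema.get("title", "<untitled>")
    # Each job schema pins its "type" property to a single enum value,
    # e.g. "similar_queries" or "argo-deploy-ray-model".
    type_enum = job_schema.get("properties", {}).get("type", {}).get("enum", ["<unknown>"])
    required = ", ".join(job_schema.get("required", []))
    print(f"{type_enum[0]}: {title} (required: {required})")

The same parsed schema can also be handed to a JSON Schema validator (for example, the jsonschema package) to check a job configuration before submitting it.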
import requests
url = "https://{FUSION HOST}/api/spark/schema"
headers = {"Authorization": "Basic <encoded-value>"}
response = requests.get(url, headers=headers)
print(response.text)
{
"type": "object",
"properties": {},
"oneOf": [
{
"type": "object",
"title": "Query-to-Query Session Based Similarity",
"description": "Use this job to to batch compute query-query similarities using a co-occurrence based approach",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"docIdField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection containing queries, document id and event counts. Can be either signal aggregation collection or raw signals collection."
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field Name",
"description": "Field containing queries.",
"default": "query_s",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Data filter query",
"description": "Solr query to additionally filter the input collection.",
"default": "*:*",
"hints": [
"dummy"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output collection",
"description": "Collection to store synonym and similar query pairs.",
"hints": [
"dummy"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"dummy",
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"specialCharsFilterString": {
"type": "string",
"title": "Special characters to be filtered out",
"description": "String of special characters to be filtered from queries.",
"default": "~!@#$^%&*\\(\\)_+={}\\[\\]|;:\"'<,>.?`/\\\\-",
"hints": [
"advanced"
]
},
"minQueryLength": {
"type": "integer",
"title": "Minimum query length",
"description": "Queries below this length (in number of characters) will not be considered for generating recommendations.",
"default": 3,
"minimum": 1,
"exclusiveMinimum": false
},
"maxQueryLength": {
"type": "integer",
"title": "Maximum query length",
"description": "Queries above this length will not be considered for generating recommendations.",
"default": 50,
"minimum": 1,
"exclusiveMinimum": false
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Solr field containing number of events (e.g., number of clicks).",
"default": "count_i"
},
"docIdField": {
"type": "string",
"title": "Document id Field Name",
"description": "Solr field containing document id that user clicked.",
"default": "doc_id_s"
},
"overlapThreshold": {
"type": "number",
"title": "Query Similarity Threshold",
"description": "The threshold above which query pairs are consider similar. Decreasing the value can fetch more pairs at the expense of quality.",
"default": 0.3,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"minQueryCount": {
"type": "integer",
"title": "Query Clicks Threshold",
"description": "The minimum number of clicked documents needed for comparing queries.",
"default": 1,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"overlapEnabled": {
"type": "boolean",
"title": "Boost on token overlap",
"description": "Maximize score for query pairs with overlapping tokens by setting score to 1.",
"default": true,
"hints": [
"advanced"
]
},
"tokenOverlapValue": {
"type": "number",
"title": "Minimum match for token overlap",
"description": "Minimum amount of overlap to consider for boosting. To specify overlap in terms of ratio, specify a value in (0, 1). To specify overlap in terms of exact count, specify a value >= 1. If value is 0, boost will be applied if one query is a substring of its pair.Stopwords are ignored while counting overlaps.",
"default": 1,
"hints": [
"advanced"
]
},
"sessionIdField": {
"type": "string",
"title": "Session/User ID field",
"description": "If session id is not available, specify user id field instead. If this field is left blank, session based recommendations will be disabled.",
"default": "session_id_s"
},
"minPairOccCount": {
"type": "integer",
"title": "Minimum query-recommendation pair occurrence count",
"description": "Minimum number of times a query pair must be generated to be considered valid.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsBlobName": {
"type": "string",
"title": "Stopwords Blob Store",
"description": "Name of the stopwords blob resource. This is a .txt file with one stopword per line. By default the file is called stopwords/stopwords_nltk_en.txt however a custom file can also be used. Check documentation for more details on format and uploading to blob store.",
"default": "stopwords/stopwords_en.txt",
"reference": "blob",
"blobType": "file:spark"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"similar_queries"
],
"default": "similar_queries",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"countField",
"docIdField",
"sessionIdField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minQueryLength",
"maxQueryLength",
"specialCharsFilterString",
"stopwordsBlobName",
"overlapThreshold",
"overlapEnabled",
"tokenOverlapValue",
"minQueryCount",
"minPairOccCount"
]
}
]
},
{
"type": "object",
"title": "Smart Answers Coldstart Training (deprecated)",
"description": "Trains Smart Answers model on a cold start (unsupervised) basis with with pre-trained or trained embeddings and deploys the trained model to the ML Model Service",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"textColName",
"deployModelName",
"modelBase",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"textColName": {
"type": "string",
"title": "Field which contains the content documents",
"description": "Field which contains the documents that will be used to learn about the vocabulary. If multiple fields, please separate them by comma, e.g. question,answer."
},
"deployModelName": {
"type": "string",
"title": "Model Deployment Name",
"description": "Name of the model to be used for deployment (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"modelBase": {
"type": "string",
"title": "Model base",
"description": "Specify one of these custom embeddings: ['word_custom', 'bpe_custom'] or choose one of the included pre-trained embeddings / models.",
"enum": [
"word_custom",
"bpe_custom",
"word_en_300d_2M",
"bpe_en_300d_10K",
"bpe_en_300d_200K",
"bpe_ja_300d_100K",
"bpe_ko_300d_100K",
"bpe_zh_300d_50K",
"bpe_multi_300d_320K",
"distilbert_en",
"distilbert_multi",
"biobert_v1.1"
],
"default": "word_en_300d_2M"
},
"testMode": {
"type": "boolean",
"title": "Test Mode",
"description": "If set to true, then the training will exit after the first iteration. Useful for ensuring that the end-to-end pipeline is working",
"default": false,
"hints": [
"hidden"
]
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1
},
"w2vEpochs": {
"type": "integer",
"title": "Word2Vec training epochs",
"description": "Number of epochs to train custom Word2Vec embeddings",
"default": 15,
"hints": [
"advanced"
]
},
"w2vVectorSize": {
"type": "integer",
"title": "Size of word vectors",
"description": "Word-vector dimensionality to represent text (suggested dimension ranges: 100~300)",
"default": 150,
"hints": [
"advanced"
]
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec window size",
"description": "The window size (context words from [-window, window]) for Word2Vec",
"default": 8,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "The proportion of data to be sampled from the full dataset. Use a value between 0 and 1 for a proportion (e.g. 0.5 for 50%), or for a specific number of examples, use an integer larger than 1. Leave blank for no sampling",
"hints": [
"advanced"
]
},
"seed": {
"type": "integer",
"title": "Seed",
"description": "Random seed for sampling",
"default": 12345,
"hints": [
"hidden"
]
},
"minTokensNum": {
"type": "integer",
"title": "Minimum number of words in doc",
"description": "Drop document if the total words is lower than this value",
"default": 1,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"maxTokensNum": {
"type": "integer",
"title": "Maximum number of words in doc",
"description": "Drop document if the total words is greater than this value",
"default": 5000,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"lowerCases": {
"type": "boolean",
"title": "Lower case all words",
"description": "Whether to lower case all words in training, i.e. whether to treat upper case and lower case words equally. Only utilized for custom embeddings or for the default model base: word_en_300d_2M.",
"default": true
},
"maxVocabSize": {
"type": "integer",
"title": "Maximum vocabulary size",
"description": "Maximum number of words in vocabulary, words will be trimmed if frequency is too low",
"default": 100000,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"extraTrainingArgs": {
"type": "string",
"title": "Extra training args for Python scripts",
"description": "Add any additional arguments for the Python training scripts in this field",
"hints": [
"hidden"
]
},
"maxLen": {
"type": "integer",
"title": "Max Length",
"description": "Max length of question/answer by number of tokens"
},
"infBatch": {
"type": "integer",
"title": "Inference batch size",
"description": "The batch size used for encoding during the training",
"hints": [
"advanced"
]
},
"numClusters": {
"type": "integer",
"title": "Number of clusters",
"description": "DEPRECATED: please, consider using Milvus for fast dense vector similarity search. Number of clusters to be used for fast dense vector retrieval. Note no clustering will be applied if this is set to 0. If left blank, cluster count will be inferred by the job depending on the data",
"default": 0,
"hints": [
"advanced"
]
},
"topKClusters": {
"type": "integer",
"title": "Top k of clusters to return",
"description": "How many closest clusters the model can find for each query. At retrieval time, all answers in top k nearest clusters will be returned and reranked",
"default": 10,
"hints": [
"advanced"
]
},
"unidecode": {
"type": "boolean",
"title": "Apply unicode decoding",
"description": "Use Unidecode library to transform Unicode input into ASCII transliterations. Only utilized for custom embeddings or for the default model base: word_en_300d_2M",
"default": true
},
"globalPoolType": {
"type": "string",
"title": "Global Pool Type",
"description": "Determines how token vectors should be aggregated to obtain final content vector. Must be one of: [avg, max].",
"enum": [
"avg",
"max"
],
"default": "avg",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-qna-coldstart"
],
"default": "argo-qna-coldstart",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"textColName",
"deployModelName",
"modelReplicas",
"secretName",
"testMode"
]
},
{
"label": "Data Preprocessing",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"seed",
"minTokensNum",
"maxTokensNum",
"lowerCases",
"unidecode",
"maxVocabSize"
]
},
{
"label": "Custom Embeddings Initialization",
"properties": [
"w2vEpochs",
"w2vVectorSize",
"w2vWindowSize"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"maxLen",
"infBatch",
"numClusters",
"topKClusters",
"globalPoolType"
]
}
]
},
{
"type": "object",
"title": "Ranking Metrics",
"description": "use this job to calculate relevance metrics (nDCG etc..) by replaying ground truth queries (see ground truth job) against catalog data using variants from an experiment.",
"required": [
"id",
"groundTruthConfig",
"rankingExperimentConfig",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"groundTruthConfig": {
"type": "object",
"title": "Configure ground truth dataset",
"description": "Configure properties for Ground truth dataset",
"required": [
"inputCollection"
],
"properties": {
"inputCollection": {
"type": "string",
"title": "Ground Truth Input Collection",
"description": "Input collection representing ground truth dataset",
"minLength": 1
},
"filterQueries": {
"type": "array",
"title": "Filter queries",
"description": "Solr filter queries to apply against Ground truth collection",
"default": [
"type:ground_truth"
],
"hints": [
"advanced"
],
"items": {
"type": "string",
"default": "[\"type:ground_truth\"]"
}
},
"queryField": {
"type": "string",
"title": "Query field",
"description": "Query field in the collection",
"default": "query",
"hints": [
"advanced"
]
},
"docIdField": {
"type": "string",
"title": "Doc ID field",
"description": "Field containing ranked doc id's",
"default": "docId",
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Weight Field",
"description": "Field representing the weight of document to the query",
"default": "weight",
"hints": [
"advanced"
]
}
}
},
"rankingExperimentConfig": {
"type": "object",
"title": "Configure experiment",
"description": "Configure properties for the experiment",
"properties": {
"inputCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection to run the experiment on",
"hints": [
"advanced"
],
"minLength": 1
},
"queryPipelines": {
"type": "array",
"title": "Query pipelines",
"description": "Pipeline variants for experiment",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"docIdField": {
"type": "string",
"title": "Doc Id Field",
"description": "Doc id field to retrieve values (Must return values that match the ground truth data)",
"default": "id",
"hints": [
"advanced"
]
},
"experimentId": {
"type": "string",
"title": "Experiment ID",
"description": "Calculate ranking metrics using variants from experiment",
"minLength": 1
},
"experimentObjectiveName": {
"type": "string",
"title": "Experiment metric name",
"description": "Experiment objective name",
"minLength": 1
},
"defaultProfile": {
"type": "string",
"title": "Default Query Profile",
"description": "Default query profile to use if not specified in experiment variants"
}
}
},
"outputCollection": {
"type": "string",
"title": "Output collection",
"description": "Output collection to save the ranking metrics to",
"minLength": 1
},
"rankingPositionK": {
"type": "integer",
"title": "Ranking Position @K",
"description": "Ranking position at K for metrics calculation",
"default": 10,
"hints": [
"advanced"
]
},
"metricsPerQuery": {
"type": "boolean",
"title": "Calculate metrics per query",
"description": "Calculate ranking metrics per each query in ground truth set and save them to Solr collection",
"default": true,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"ranking_metrics"
],
"default": "ranking_metrics",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Ground Truth Parameters",
"properties": [
"groundTruthConfig"
]
},
{
"label": "Ranking Experiment Parameters",
"properties": [
"rankingExperimentConfig"
]
}
]
},
{
"type": "object",
"title": "Data Augmentation (deprecated)",
"description": "Use this job to perform Text Augmentation",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"outputCollection",
"outputFormat",
"includeOriginalData",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Input format",
"description": "The format of the training data - solr, parquet etc.",
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"batchSize": {
"type": "string",
"title": "Batch Size",
"description": "If writing to solr, this field defines the batch size for documents to be pushed to solr.",
"default": "15000",
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output path",
"description": "Output collection to store generated augmented data.",
"minLength": 1
},
"outputFormat": {
"type": "string",
"title": "Output Format",
"description": "The format of the output data - solr, parquet etc.",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"backTranslations": {
"type": "array",
"title": "Back Translation",
"description": "Augment data via translation to a different language and then translating back to original language. Chain of languages can be used for translation. Works at sentence level for medium-long length text. GPU recommended and will be used when available.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish",
"Hebrew",
"Ukrainian",
"Chinese",
"Japanese",
"Korean"
],
"minLength": 1
},
"intermediateLanguage": {
"type": "string",
"title": "Intermediate Language",
"description": "Specify languages in order to be used in back translation separated by comma. Only use languages present in input data language dropdown. Bigger chains will take more time to augment. ",
"default": "German",
"pattern": "((?:English|German|French|Italian|Spanish|Dutch|Polish|Ukrainian|Hebrew|Chinese|Japanese|Korean)*(\\s)*(,)*(\\s)*){0,12}"
},
"batchSize": {
"type": "integer",
"title": "Batch Size",
"description": "Number of input data samples to back-translate at a time. Important if Use GPU is checked to avoid memory overflow.",
"default": 256,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"beamSize": {
"type": "integer",
"title": "Beam Size",
"description": "Number of beams to evaluate during translation. Use higher number if translation is poor. Higher number will increase execution time and memory use.",
"default": 1,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"minSentenceLength": {
"type": "integer",
"title": "Min translation length (tokens)",
"description": "Do not back translate sentences shorter than specified length in tokens. If the value is more than max translation length, then max translation length will be used.",
"default": 40,
"hints": [
"advanced"
],
"maximum": 510,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxSentenceLength": {
"type": "integer",
"title": "Max translation length (tokens)",
"description": "Do not back translate sentences longer than specified length in tokens. If the value is less than min translation length, hen min translation length will be used.",
"default": 240,
"hints": [
"advanced"
],
"maximum": 510,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
}
}
}
},
"keyStrokeMisspellings": {
"type": "array",
"title": "Keystroke Misspellings",
"description": "Augment data via insertion, substitution, swapping and deletion of characters based on keyboard layout. Useful for short text.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish",
"Hebrew",
"Ukrainian"
],
"minLength": 1
},
"minCharAugment": {
"type": "integer",
"title": "Minimum Chars to Augment",
"description": "Minimum number of characters to augment in each word. If the value is more than Maximum Chars to Augment, then Maximum Chars to Augment will be used.",
"default": 1,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxCharAugment": {
"type": "integer",
"title": "Maximum Chars to Augment",
"description": "Maximum number of characters to augment in each word. If the value is more than Minimum Chars to Augment, then Minimum Chars to Augment will be used.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"minWordsToAugment": {
"type": "integer",
"title": "Min words to Augment",
"description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxWordsToAugment": {
"type": "integer",
"title": "Max words to Augment",
"description": "Maximum number of words to be augmented in input text.It should be less than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
"default": 10,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"wordPercentageToAugment": {
"type": "number",
"title": "Percentage words to Augment",
"description": "Percentage of words in input text to augment. If specified this will be used instead if minimum/maximum number of words to augment value.",
"default": 0.2,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"keywordsBlobName": {
"type": "string",
"title": "Keystroke Mapping",
"description": "Keystroke Mapping for required language in JSON format from blob store.",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "file:spark"
}
}
}
},
"synonymSubstitutions": {
"type": "array",
"title": "Synonym Substitution",
"description": "Augment data via substituting words using synonyms from wordnet or user supplied dictionary. Useful for short, medium and long text. Faster and less resource intensive than back translation.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish",
"Hebrew",
"Chinese",
"Japanese"
],
"minLength": 1
},
"minWordsToAugment": {
"type": "integer",
"title": "Min words to Augment",
"description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxWordsToAugment": {
"type": "integer",
"title": "Max words to Augment",
"description": "Maximum number of words to be augmented in input text.It should be less than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
"default": 10,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"wordPercentageToAugment": {
"type": "number",
"title": "Percentage of words to Augment",
"description": "Percentage of words in input text to augment. If specified this will be used instead if minimum/maximum number of words to augment value.",
"default": 0.2,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"stopwordsBlobName": {
"type": "string",
"title": "Synonym Dictionary Name",
"description": "Wordnet format dictionary to use from blob store",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "file:spark"
}
}
}
},
"splitWords": {
"type": "array",
"title": "Split Words",
"description": "Augment data via splitting some words. Useful for short, medium and long text.",
"items": {
"type": "object",
"required": [
"fieldname",
"inputLanguage"
],
"properties": {
"fieldname": {
"type": "string",
"title": "Field Name",
"description": "Name of the input field to augment.",
"minLength": 1
},
"inputLanguage": {
"type": "string",
"title": "Input data Language",
"description": "Language of input data.",
"enum": [
"English",
"French",
"German",
"Italian",
"Spanish",
"Dutch",
"Polish"
],
"minLength": 1
},
"minWordLength": {
"type": "integer",
"title": "Minimum Word Length",
"description": "Do not augment words less than this length (in characters). If the value is more than maximum word length, then maximum word length will be used.",
"default": 4,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"minWordsToAugment": {
"type": "integer",
"title": "Min words to Augment",
"description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
"default": 2,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"maxWordsToAugment": {
"type": "integer",
"title": "Max words to Augment",
"description": "Maximum number of words to be augmented in input text.It should be less than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
"default": 10,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"wordPercentageToAugment": {
"type": "number",
"title": "Percentage of words to Augment",
"description": "Percentage of words in input text to augment. If specified this will be used instead if minimum/maximum number of words to augment value.",
"default": 0.2,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
}
}
}
},
"includeOriginalData": {
"type": "boolean",
"title": "Include original data",
"description": "When checked original data will be included in the augmented dataset",
"default": true
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-data-augmentation"
],
"default": "argo-data-augmentation",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"trainingDataFilterQuery",
"trainingSampleFraction",
"randomSeed",
"batchSize",
"outputCollection",
"outputFormat",
"partitionFields",
"secretName",
"includeOriginalData"
]
},
{
"label": "Augmentation Parameters",
"properties": [
"backTranslations",
"keyStrokeMisspellings",
"synonymSubstitutions",
"splitWords"
]
}
]
},
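As a reference point, here is a minimal sketch of creating a Data Augmentation job against the schema above, in the same style as the request at the top of this page. The /api/spark/configurations endpoint path, the job ID, and the field name are illustrative assumptions, not part of the schema; the type value and the required fieldname/inputLanguage keys come straight from the schema.

import requests

# Sketch only: the endpoint path, job ID, and field name are assumptions.
url = "https://{FUSION HOST}/api/spark/configurations"
headers = {
    "Authorization": "Basic <encoded-value>",
    "Content-Type": "application/json",
}
job = {
    "id": "augment-training-data",            # hypothetical job ID
    "type": "argo-data-augmentation",         # from the schema enum above
    "includeOriginalData": True,
    "backTranslations": [
        {
            "fieldname": "description_t",     # hypothetical input field
            "inputLanguage": "English",       # must be one of the enum values
            "intermediateLanguage": "German"  # schema default
        }
    ]
}
response = requests.post(url, json=job, headers=headers)
print(response.status_code, response.text)

The later sketches in this section show only the job payload; POSTing it follows the same pattern.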
{
"type": "object",
"title": "Create Indexes in Milvus (deprecated)",
"description": "Creates indexes for specified collections in Milvus",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"indexes-list": {
"type": "array",
"title": "Indexes",
"description": "List of the indexes that should be created with corresponding params.",
"items": {
"type": "object",
"required": [
"milvusCollectionName",
"indexType"
],
"properties": {
"milvusCollectionName": {
"type": "string",
"title": "Collection Name",
"description": "Name of the collection in Milvus in which index should be created"
},
"indexType": {
"type": "string",
"title": "Index Type",
"description": "Index type which should be create for specified collection",
"enum": [
"FLAT",
"IVFLAT",
"IVF_SQ8",
"RNSG",
"IVF_SQ8H",
"IVF_PQ",
"HNSW",
"ANNOY"
],
"default": "HNSW"
},
"indexParams": {
"type": "array",
"title": "Index Parameters",
"description": "Parameters to be used to create index in Milvus. Specific to the chosen IndexType. For example, good starting values might be [M=36, efConstruction=500] for HNSW index and [nlist=4×sqrt(number of vectors)] for IVF indexes.",
"items": {
"type": "object",
"properties": {
"key": {
"type": "string",
"title": "Milvus Index Param",
"description": "The name of the Milvus index params like M / efConstruction for HNSW or nlist for IVF indexes."
},
"value": {
"type": "integer",
"title": "Value",
"description": "Index param value. For example, good starting values might be [M=36, efConstruction=500] for HNSW index and [nlist=4×sqrt(number of vectors)] for IVF indexes."
}
}
}
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-create-indexes"
],
"default": "argo-milvus-create-indexes",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
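Continuing the same pattern, a sketch of a payload for the (deprecated) Create Indexes in Milvus job. The Milvus collection name is hypothetical; the HNSW starting values M=36 and efConstruction=500 are the ones suggested in the indexParams description above.

# Sketch only: "products_vectors" is a hypothetical Milvus collection name.
job = {
    "id": "create-milvus-indexes",
    "type": "argo-milvus-create-indexes",
    "indexes-list": [
        {
            "milvusCollectionName": "products_vectors",
            "indexType": "HNSW",                         # schema default
            "indexParams": [
                {"key": "M", "value": 36},               # suggested starting value
                {"key": "efConstruction", "value": 500}  # suggested starting value
            ]
        }
    ]
}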
{
"type": "object",
"title": "Custom Python Job",
"description": "Use this job when you want to run a python/pyspark job",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"script": {
"type": "string",
"title": "Python Script",
"description": "Custom python/pyspark script to be submitted as a Fusion job",
"hints": [
"code/python",
"lengthy"
],
"minLength": 1
},
"resourceName": {
"type": "string",
"title": "Blob Resource (python file)",
"description": "Name of the resource uploaded to Blob store. This should match with the Blob name",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
},
"pythonFiles": {
"type": "array",
"title": "Python Files",
"description": "Blob resource (.zip, .egg, .py files) to place on the PYTHONPATH for Python apps",
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"submitArgs": {
"type": "array",
"title": "Spark args",
"description": "Additional options to pass to the Spark Submit when running this job.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"javaOptions": {
"type": "array",
"title": "Java options",
"description": "Java options to pass to Spark driver/executor",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"verboseReporting": {
"type": "boolean",
"title": "Verbose reporting",
"description": "Enables verbose reporting for SparkSubmit",
"default": true,
"hints": [
"advanced"
]
},
"envOptions": {
"type": "array",
"title": "ENV properties",
"description": "Set environment variables for driver",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"custom_python_job"
],
"default": "custom_python_job",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
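The Custom Python Job requires only id and type; in practice you would also supply either an inline script or a blob resourceName. A minimal sketch with an inline PySpark script (the job ID is hypothetical):

# Sketch only: a trivial inline PySpark script submitted as a Fusion job.
script = """
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
print("Spark version:", spark.version)
"""
job = {
    "id": "hello-pyspark",        # hypothetical job ID
    "type": "custom_python_job",
    "script": script
}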
{
"type": "object",
"title": "Head/Tail Analysis (Deprecated)",
"description": "Use this job when you want to compare the head and tail of your queries to find common misspellings and rewritings. See the insights analytics pane for a review of the results of the job. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"countField",
"mainType",
"signalTypeField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Signals collection containing queries and event counts. Raw signals or aggregation collection can be used. If aggregation collection is being used, update the filter query in advanced options",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field Name",
"description": "Field containing the queries",
"default": "query",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Signals data filter query",
"description": "Solr query to use when loading training data if using Solr (e.g. type:click OR type:response), Spark SQL expression for all other data sources",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr collection to store head tail analytics results. Defaults to job reports collection"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"tailRewriteCollection": {
"type": "string",
"title": "Tail Rewrite Collection",
"description": "Collection where tail rewrites are stored.",
"minLength": 1
},
"analyzerConfigQuery": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"StdTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"englishminimalstem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStem\" } ]}",
"hints": [
"lengthy",
"advanced",
"code/json"
],
"minLength": 1
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Field containing the number of times an event (like a click) occurs for a particular query; count_i in the raw signal collection or aggr_count_i in the aggregated signal collection.",
"default": "count_i",
"minLength": 1
},
"mainType": {
"type": "string",
"title": "Main Event Type",
"description": "The main signal event type (e.g. click) that head tail analysis is based on. E.g., if main type is click, then head and tail queries are defined by the number of clicks.",
"default": "click",
"minLength": 1
},
"filterType": {
"type": "string",
"title": "Filtering Event Type",
"description": "The secondary event type (e.g. response) that can be used for filtering out rare searches. Note: In order to use the `response` default value, please make sure you have type:response in the input collection. If there is no need to filter on number of searches, please leave this parameter blank.",
"default": "response"
},
"signalTypeField": {
"type": "string",
"title": "Field Name of Signal Type",
"description": "The field name of signal type in the input collection.",
"default": "type"
},
"minCountMain": {
"type": "integer",
"title": "Minimum Main Event Count",
"description": "Minimum number of main events (e.g. clicks after aggregation) necessary for the query to be considered. The job will only analyze queries with clicks greater or equal to this number.",
"default": 1
},
"minCountFilter": {
"type": "integer",
"title": "Minimum Filtering Event Count",
"description": "Minimum number of filtering events (e.g. searches after aggregation) necessary for the query to be considered. The job will only analyze queries that were issued greater or equal to this number of times.",
"default": 20
},
"queryLenThreshold": {
"type": "integer",
"title": "Minimum Query Length ",
"description": "Minimum length of a query to be included for analysis. The job will only analyze queries with length greater than or equal to this value.",
"default": 2
},
"userHead": {
"type": "number",
"title": "Head Count Threshold",
"description": "User defined threshold for head definition. value=-1.0 will allow the program to pick the number automatically. value<1.0 denotes a percentage (e.g 0.1 means put the top 10% of queries into the head), value=1.0 denotes 100% (e.g 1 means put all queries into the head), value>1.0 denotes the exact number of queries to put in the head (e.g 100 means the top 100 queries constitute the head)",
"default": -1,
"hints": [
"advanced"
]
},
"userTail": {
"type": "number",
"title": "Tail Count Threshold",
"description": "User defined threshold for tail definition. value=-1.0 will allow the program to pick the number automatically. value<1.0 denotes a percentage, (e.g 0.1 means put the bottom 10% of queries into the tail) value=1.0 denotes 100% (e.g 1 means put all queries into the tail), value>1.0 denotes the exact number of queries to put into the tail (e.g 100 means the bottom 100 queries constitute the tail).",
"default": -1,
"hints": [
"advanced"
]
},
"topQ": {
"type": "array",
"title": "Top X% Head Query Event Count",
"description": "Compute how many total events come from the top X head queries (Either a number greater than or equal to 1.0 or a percentage of the total number of unique queries)",
"default": [
100,
0.01
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"trafficPerc": {
"type": "array",
"title": "Number of Queries that Constitute X% of Total Events",
"description": "Compute how many queries constitute each of the specified event portions(E.g., 0.25, 0.50)",
"default": [
0.25,
0.5,
0.75
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"lastTraffic": {
"type": "array",
"title": "Bottom X% Tail Query Event Count",
"description": "Compute the total number of queries that are spread over each of the specified tail event portions (E.g., 0.01)",
"default": [
0.01
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"trafficCount": {
"type": "array",
"title": "Event Count Computation Threshold",
"description": "Compute how many queries have events less than each value specified (E.g., a value of 5.0 would return the number of queries that have less than 5 associated events)",
"default": [
5
],
"hints": [
"advanced"
],
"items": {
"type": "number"
}
},
"keywordsBlobName": {
"type": "string",
"title": "Keywords blob name",
"description": "Name of the keywords blob resource. Typically, this should be a csv file uploaded to blob store in a specific format. Check documentation for more details on format and uploading to blob store ",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
},
"lenScale": {
"type": "integer",
"title": "Edit Distance vs String Length Scale",
"description": "A scaling factor used to normalize the length of the query string. This filters head and tail string match based on if edit_dist <= string_length/length_scale. A large value for this factor leads to a shorter spelling list. A smaller value leads to a longer spelling list but may add lower quality corrections.",
"default": 6,
"hints": [
"advanced"
]
},
"overlapThreshold": {
"type": "integer",
"title": "Head and tail Overlap threshold",
"description": "The threshold for the number of overlapping tokens between the head and tail. When a head string and tail string share more tokens than this threshold, they are considered a good match.",
"default": 4,
"hints": [
"advanced"
]
},
"overlapNumBoost": {
"type": "number",
"title": "Token Overlap Number Boost",
"description": "When there are multiple possible head matches for a tail, we rank heads based on: overlapNumBoost * overlapNum + headQueryCountBoost * log(headQueryCount). A big number puts more weight on how many tokens match between the head and tail query strings instead of the number of times a head query appears.",
"default": 10,
"hints": [
"hidden",
"advanced"
]
},
"headQueryCntBoost": {
"type": "number",
"title": "Head query count boost",
"description": "When there are multiple possible head matches for tail, we rank heads based on: overlapNumBoost * overlapNum + headQueryCountBoost * log(headQueryCount). A big number puts more weight on the count head query instead of the number of tokens shared between the head and tail query strings",
"default": 1,
"hints": [
"hidden",
"advanced"
]
},
"tailRewrite": {
"type": "boolean",
"title": "Generate tail rewrite table",
"description": "If true, also generate tail rewrite table, o.w., only get distributions. May need to set it to false in the very first run to help customize head and tail positions.",
"default": true,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"headTailAnalysis"
],
"default": "headTailAnalysis",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"signalTypeField",
"mainType",
"filterType",
"countField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minCountMain",
"minCountFilter",
"tailRewrite",
"userHead",
"userTail",
"lenScale",
"overlapThreshold",
"topQ",
"trafficCount",
"trafficPerc",
"lastTraffic"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfigQuery",
"queryLenThreshold"
]
},
{
"label": "Misc. Parameters",
"properties": [
"keywordsBlobName"
]
}
]
},
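Collecting the required Head/Tail Analysis fields (id, trainingCollection, fieldToVectorize, dataFormat, countField, mainType, signalTypeField, type) gives a minimal sketch like the following; the signals collection name is hypothetical and the remaining values are the schema defaults.

# Sketch only: "acme_signals" is a hypothetical signals collection.
job = {
    "id": "head-tail-analysis",
    "type": "headTailAnalysis",
    "trainingCollection": "acme_signals",
    "fieldToVectorize": "query",    # schema default
    "dataFormat": "solr",           # schema default
    "countField": "count_i",        # schema default
    "mainType": "click",            # schema default
    "signalTypeField": "type"       # schema default
}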
{
"type": "object",
"title": "SQL Aggregation",
"description": "Use this job when you want to aggregate your data in some way.",
"required": [
"id",
"inputCollection",
"sql",
"dataFormat",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputCollection": {
"type": "string",
"title": "Source Collection",
"description": "Collection containing signals to be aggregated."
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "The collection to write the aggregates to on output. This property is required if the selected output / rollup pipeline requires it (the default pipeline does). A special value of '-' disables the output.",
"hints": [
"advanced"
],
"minLength": 1
},
"rows": {
"type": "integer",
"title": "Batch Size",
"description": "Number of rows to read from the source collection per request.",
"default": 10000,
"hints": [
"advanced"
]
},
"sql": {
"type": "string",
"title": "SQL",
"description": "Use SQL to perform the aggregation. You do not need to include a time range filter in the WHERE clause as it gets applied automatically before executing the SQL statement.",
"hints": [
"lengthy",
"code/sql"
],
"minLength": 1
},
"rollupSql": {
"type": "string",
"title": "Rollup SQL",
"description": "Use SQL to perform a rollup of previously aggregated docs. If left blank, the aggregation framework will supply a default SQL query to rollup aggregated metrics.",
"hints": [
"lengthy",
"code/sql",
"advanced"
],
"minLength": 1
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Additional configuration settings to fine-tune how input records are read for this aggregation.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"sourceCatchup": {
"type": "boolean",
"title": "Aggregate New and Merge with Existing",
"description": "If checked, only aggregate new signals created since the last time the job was successfully run. If there is a record of such previous run then this overrides the starting time of time range set in 'timeRange' property. If unchecked, then all matching signals are aggregated and any previously aggregated docs are deleted to avoid double counting.",
"default": true,
"hints": [
"advanced"
]
},
"sourceRemove": {
"type": "boolean",
"title": "Remove signals from source",
"description": "If checked, remove signals from source collection once aggregation job has finished running.",
"default": false,
"hints": [
"advanced"
]
},
"aggregationTime": {
"type": "string",
"title": "Aggregation Time",
"description": "Timestamp to use for the aggregation results. Defaults to NOW.",
"hints": [
"advanced"
],
"format": "date-time"
},
"referenceTime": {
"type": "string",
"title": "Reference Time",
"description": "Timestamp to use for computing decays and to determine the value of NOW.",
"hints": [
"advanced"
],
"format": "date-time"
},
"skipCheckEnabled": {
"type": "boolean",
"title": "Job Skip Check Enabled?",
"description": "If the catch-up flag is enabled and this field is checked, the job framework will execute a fast Solr query to determine if this run can be skipped.",
"default": true,
"hints": [
"advanced"
]
},
"skipJobIfSignalsEmpty": {
"type": "boolean",
"title": "Skip Job run",
"description": "Skip Job run if signals collection is empty",
"hints": [
"advanced"
]
},
"parameters": {
"type": "array",
"title": "Parameters",
"description": "Other aggregation parameters (e.g. timestamp field etc..).",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"signalTypes": {
"type": "array",
"title": "Signal Types",
"description": "The signal types. If not set then any signal type is selected",
"items": {
"type": "string"
}
},
"selectQuery": {
"type": "string",
"title": "Query",
"description": "The query to select the desired input documents.",
"default": "*:*",
"hints": [
"advanced"
],
"minLength": 1
},
"timeRange": {
"type": "string",
"title": "Time Range",
"description": "The time range to select signals on.",
"hints": [
"advanced"
],
"minLength": 1
},
"useNaturalKey": {
"type": "boolean",
"title": "Use Natural Key?",
"description": "Use a natural key provided in the raw signals data for aggregation, rather than relying on Solr UUIDs. Migrated aggregations jobs from Fusion 4 will need this set to false.",
"default": true,
"hints": [
"advanced"
]
},
"optimizeSegments": {
"type": "integer",
"title": "Optimize Segments",
"description": "If set to a value above 0, the aggregator job will optimize the resulting Solr collection into this many segments",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"aggregation"
],
"default": "aggregation",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
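For the SQL Aggregation job, the required fields are id, inputCollection, sql, dataFormat, and type. In the sketch below the collection name and the table name in the FROM clause are hypothetical; note that, per the sql description above, no time-range filter is needed in the WHERE clause.

# Sketch only: collection name and FROM-clause table name are hypothetical.
job = {
    "id": "click-signals-aggr",
    "type": "aggregation",
    "inputCollection": "acme_signals",
    "dataFormat": "solr",    # schema default
    "sql": (
        "SELECT query_s, doc_id_s, SUM(count_i) AS aggr_count_i "
        "FROM acme_signals GROUP BY query_s, doc_id_s"
    )
}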
{
"type": "object",
"title": "Random Forest Classifier Training (deprecated)",
"description": "Use this job when you have training data and you want to train a random forest model to classify text into groups. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the Classification job instead.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"trainingLabelField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"advanced",
"code/json",
"lengthy"
]
},
"withIdf": {
"type": "boolean",
"title": "IDF Weighting",
"description": "Weight vector components based on inverse document frequency",
"default": true,
"hints": [
"advanced"
]
},
"w2vDimension": {
"type": "integer",
"title": "Word2Vec Dimension",
"description": "Word-vector dimensionality to represent text (choose > 0 to use)",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 5,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxSentenceLength": {
"type": "integer",
"title": "Max Word2Vec Sentence Length",
"description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
"default": 1000,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxIter": {
"type": "integer",
"title": "Max Word2Vec Iterations",
"description": "Maximum number of iterations of the word2vec training",
"default": 1,
"hints": [
"advanced"
]
},
"w2vStepSize": {
"type": "number",
"title": "Word2Vec Step Size",
"description": "Training parameter for word2vec convergence (change at your own peril)",
"default": 0.025,
"hints": [
"advanced"
],
"minimum": 0.005,
"exclusiveMinimum": false
},
"minDF": {
"type": "number",
"title": "Minimum Term Document Frequency",
"description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
"default": 0,
"hints": [
"advanced"
]
},
"maxDF": {
"type": "number",
"title": "Max Term Document Frequency",
"description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
"default": 1,
"hints": [
"advanced"
]
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"predictedLabelField": {
"type": "string",
"title": "Predicted Label Field",
"description": "Solr field which will contain labels when classifier is applied to documents",
"default": "labelPredictedByFusionModel",
"hints": [
"advanced"
]
},
"serializeAsMleap": {
"type": "boolean",
"title": "Serialize as Mleap Bundle",
"description": "Serialize the output model as Mleap Bundle",
"default": true,
"hints": [
"hidden"
]
},
"minSparkPartitions": {
"type": "integer",
"title": "Minimum Number of Spark Partitions",
"description": "Minimum number of Spark partitions for training job.",
"default": 200,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"overwriteExistingModel": {
"type": "boolean",
"title": "Overwrite existing model",
"description": "If a model exists in the model store, overwrite when this job runs",
"default": true,
"hints": [
"advanced"
]
},
"trainingLabelField": {
"type": "string",
"title": "Label Field",
"description": "Solr field containing labels for training instances (should be single-valued strings)"
},
"gridSearch": {
"type": "boolean",
"title": "Grid Search with Cross Validation",
"description": "Perform grid search to optimize hyperparameters",
"default": false
},
"evaluationMetricType": {
"type": "string",
"title": "Evaluation Metric Type",
"description": "Optimize hyperparameter search over one of [binary, multiclass, regression] metrics, or 'none'",
"enum": [
"binary",
"multiclass",
"regression",
"none"
],
"default": "none",
"hints": [
"advanced"
]
},
"autoBalanceClasses": {
"type": "boolean",
"title": "Auto-balance training classes",
"description": "Ensure that all classes of training data have the same size",
"default": true,
"hints": [
"advanced"
]
},
"minTrainingSamplesPerClass": {
"type": "integer",
"title": "Minimum Labeled Class Size",
"description": "Ensure that all classes of training data have at least this many examples",
"default": 100,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"makeOtherClass": {
"type": "boolean",
"title": "Make 'Other' Class",
"description": "Create a label class 'Other' which contains all examples not in a class large enough to train on",
"default": true,
"hints": [
"advanced"
]
},
"otherClassName": {
"type": "string",
"title": "'Other' class name",
"description": "Label class name for the catch-all 'Other' class",
"default": "Other",
"hints": [
"advanced"
],
"minLength": 1
},
"maxDepth": {
"type": "integer",
"title": "Maximum tree depth",
"description": "Maximum depth of the tree (>= 0). E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.",
"default": 5,
"maximum": 20,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false
},
"maxBins": {
"type": "integer",
"title": "Maximum number of discretizing bins",
"description": "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.",
"default": 32,
"maximum": 128,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"numTrees": {
"type": "integer",
"title": "Number of trees",
"description": "Number of trees to train (>= 1)",
"default": 20,
"maximum": 1000,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"random_forests_classifier"
],
"default": "random_forests_classifier",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"predictedLabelField",
"trainingLabelField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"w2vDimension",
"w2vWindowSize",
"w2vMaxIter",
"w2vMaxSentenceLength",
"w2vStepSize",
"withIdf",
"maxDF",
"minDF",
"norm",
"autoBalanceClasses",
"evaluationMetricType",
"minTrainingSamplesPerClass",
"otherClassName",
"makeOtherClass",
"gridSearch",
"maxBins",
"numTrees",
"maxDepth"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
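A minimal sketch for the (deprecated) Random Forest Classifier Training job, covering its required fields (id, trainingCollection, fieldToVectorize, dataFormat, trainingLabelField, type); the collection and field names are hypothetical.

# Sketch only: collection and field names are hypothetical.
job = {
    "id": "rf-train",
    "type": "random_forests_classifier",
    "trainingCollection": "labeled_docs",
    "fieldToVectorize": "body_t",
    "dataFormat": "solr",               # schema default
    "trainingLabelField": "category_s"  # single-valued string label field
}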
{
"type": "object",
"title": "Create Collections in Milvus (deprecated)",
"description": "Creates collections with specified parameters in Milvus",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"collections-list": {
"type": "array",
"title": "Collections",
"description": "List of the collections that should be created with corresponding params.",
"items": {
"type": "object",
"required": [
"milvusCollectionName",
"dimension",
"indexFileSize",
"metric"
],
"properties": {
"milvusCollectionName": {
"type": "string",
"title": "Collection Name",
"description": "Name of the collection to create in Milvus",
"pattern": "^[a-zA-Z0-9_]+$"
},
"dimension": {
"type": "integer",
"title": "Dimension",
"description": "Dimension size of vectors to be stored in the collection"
},
"indexFileSize": {
"type": "integer",
"title": "Index File Size",
"description": "Files larger than this will trigger index building for raw data files",
"default": 1024,
"minimum": 1,
"exclusiveMinimum": false
},
"metric": {
"type": "string",
"title": "Metric",
"description": "Metric which should be used for vectors similarity",
"enum": [
"Euclidean",
"Inner Product",
"Hamming",
"Jaccard",
"Tanimoto",
"Substructure",
"Superstructure"
],
"default": "Inner Product"
}
}
}
},
"allow-recreate": {
"type": "boolean",
"title": "Override collections",
"description": "If checked and there are existing collections with the same names, they will be dropped and recreated. If unchecked, the exception is thrown in such situation."
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-create-collections"
],
"default": "argo-milvus-create-collections",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
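A sketch of the (deprecated) Create Collections in Milvus payload. Each entry must supply the four required keys, and the collection name must match the ^[a-zA-Z0-9_]+$ pattern above; the name and dimension here are hypothetical, the rest are schema defaults.

# Sketch only: collection name and vector dimension are hypothetical.
job = {
    "id": "create-milvus-collections",
    "type": "argo-milvus-create-collections",
    "collections-list": [
        {
            "milvusCollectionName": "products_vectors",
            "dimension": 256,
            "indexFileSize": 1024,     # schema default
            "metric": "Inner Product"  # schema default
        }
    ],
    "allow-recreate": False
}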
{
"type": "object",
"title": "Word2Vec Model Training (deprecated)",
"description": "Trains a shallow neural model, and projects each document onto this vector embedding space. Deprecated as of Fusion 5.2.0 and will be removed in a future release.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"advanced",
"code/json",
"lengthy"
]
},
"withIdf": {
"type": "boolean",
"title": "IDF Weighting",
"description": "Weight vector components based on inverse document frequency",
"default": true,
"hints": [
"advanced"
]
},
"w2vDimension": {
"type": "integer",
"title": "Embedding Dimension",
"description": "Word-vector dimensionality to represent text",
"default": 50,
"hints": [
"dummy"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 5,
"hints": [
"dummy"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxSentenceLength": {
"type": "integer",
"title": "Max Sentence Length",
"description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
"default": 1000,
"hints": [
"dummy"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxIter": {
"type": "integer",
"title": "Max Iterations",
"description": "Maximum number of iterations of the word2vec training",
"default": 1,
"hints": [
"advanced"
]
},
"w2vStepSize": {
"type": "number",
"title": "Step Size",
"description": "Training parameter for word2vec convergence (change at your own peril)",
"default": 0.025,
"hints": [
"advanced"
],
"minimum": 0.005,
"exclusiveMinimum": false
},
"minDF": {
"type": "number",
"title": "Minimum Term Document Frequency",
"description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
"default": 0,
"hints": [
"advanced"
]
},
"maxDF": {
"type": "number",
"title": "Max Term Document Frequency",
"description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
"default": 1,
"hints": [
"advanced"
]
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"predictedLabelField": {
"type": "string",
"title": "Word2Vec Feature Field",
"description": "Solr field which will contain vector features when the word2vec model is applied to documents",
"default": "w2vFeatures",
"hints": [
"hidden"
]
},
"serializeAsMleap": {
"type": "boolean",
"title": "Serialize as Mleap Bundle",
"description": "Serialize the output model as Mleap Bundle",
"default": true,
"hints": [
"hidden"
]
},
"minSparkPartitions": {
"type": "integer",
"title": "Minimum Number of Spark Partitions",
"description": "Minimum number of Spark partitions for training job.",
"default": 200,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"overwriteExistingModel": {
"type": "boolean",
"title": "Overwrite existing model",
"description": "If a model exists in the model store, overwrite when this job runs",
"default": true,
"hints": [
"advanced"
]
},
"outputField": {
"type": "string",
"title": "Output Field",
"description": "Solr field which will contain terms which the word2vec model considers are related to the input",
"default": "related_terms_txt"
},
"uidField": {
"type": "string",
"title": "ID Field Name",
"description": "Field containing the unique ID for each document",
"minLength": 1
},
"numRelatedTerms": {
"type": "integer",
"title": "Number of Related Words",
"description": "For each collection of input words, find this many word2vec-related words",
"default": 10,
"minimum": 1,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"word2vec"
],
"default": "word2vec",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"predictedLabelField",
"uidField",
"outputField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"w2vDimension",
"w2vWindowSize",
"w2vMaxIter",
"w2vMaxSentenceLength",
"w2vStepSize",
"withIdf",
"maxDF",
"minDF",
"norm",
"numRelatedTerms"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
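Every job type in this schema is a single branch of the top-level oneOf, and each branch pins its "type" field to one enum value, so a candidate configuration can be validated client-side before it is submitted. A minimal sketch, assuming the jsonschema package is available; the job ID, collection, and field values below are illustrative, not part of the API:

import requests
from jsonschema import ValidationError, validate

# Fetch the job schema exactly as shown at the top of this page.
schema = requests.get(
    "https://{FUSION HOST}/api/spark/schema",
    headers={"Authorization": "Basic <encoded-value>"},
).json()

# Candidate word2vec job config; "type" selects the matching oneOf branch.
job = {
    "id": "related-terms-w2v",            # illustrative job ID
    "type": "word2vec",
    "trainingCollection": "products",     # illustrative collection
    "fieldToVectorize": "description_t",  # illustrative field
    "dataFormat": "solr",
    "uidField": "id",
    "outputField": "related_terms_txt",
}

try:
    validate(instance=job, schema=schema)
    print("config matches the schema")
except ValidationError as e:
    print("invalid config:", e.message)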
{
"type": "object",
"title": "Parameterized SQL Aggregation",
"description": "A SQL aggregation job where users provide parameters to be injected into a built-in SQL template at runtime.",
"required": [
"id",
"inputCollection",
"dataFormat",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputCollection": {
"type": "string",
"title": "Source Collection",
"description": "Collection containing documents to be aggregated."
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "The collection to write the aggregates to on output. Defaults to the input collection if not specified."
},
"notes": {
"type": "string",
"title": "Notes",
"description": "A short description about this job.",
"hints": [
"lengthy"
]
},
"parameters": {
"type": "array",
"title": "SQL Parameters",
"description": "Parameters bound on the SQL template at runtime.",
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"selectQuery": {
"type": "string",
"title": "Query",
"description": "The query to select the desired signals. If not set then '*:*' will be used, or equivalent.",
"default": "*:*",
"hints": [
"advanced"
]
},
"timeRange": {
"type": "string",
"title": "Time Range",
"description": "The time range to select signals on.",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceCatchup": {
"type": "boolean",
"title": "Aggregate New and Merge with Existing",
"description": "If checked, only aggregate new signals created since the last time the job was successfully run. If there is a record of such previous run then this overrides the starting time of time range set in 'timeRange' property. If unchecked, then all matching signals are aggregated and any previously aggregated docs are deleted to avoid double counting.",
"default": true,
"hints": [
"advanced"
]
},
"sql": {
"type": "string",
"title": "SQL",
"description": "Use SQL to perform the aggregation. You do not need to include a time range filter in the WHERE clause as it gets applied automatically before executing the SQL statement.",
"hints": [
"advanced",
"code/sql"
],
"minLength": 1
},
"rollupSql": {
"type": "string",
"title": "Rollup SQL",
"description": "Use SQL to perform a rollup of previously aggregated docs. If left blank, the aggregation framework will supply a default SQL query to rollup aggregated metrics.",
"hints": [
"advanced",
"code/sql"
],
"minLength": 1
},
"sourceRemove": {
"type": "boolean",
"title": "Remove Source",
"description": "If true, the processed source signals will be removed after aggregation. Default is false.",
"default": false,
"hints": [
"advanced"
]
},
"referenceTime": {
"type": "string",
"title": "Reference Time",
"description": "Timestamp to use for computing decays and to determine the value of NOW.",
"hints": [
"advanced"
],
"format": "date-time"
},
"hiddenParameters": {
"type": "array",
"title": "Hidden Parameters",
"description": "Additional settings used to tune the underlying aggregation job.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Additional configuration settings to fine-tune how input records are read for this aggregation.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"skipCheckEnabled": {
"type": "boolean",
"title": "Job Skip Check Enabled?",
"description": "If the catch-up flag is enabled and this field is checked, the job framework will execute a fast Solr query to determine if this run can be skipped.",
"default": true,
"hints": [
"advanced"
]
},
"useNaturalKey": {
"type": "boolean",
"title": "Use Natural Key?",
"description": "Use a natural key provided in the raw signals data for aggregation, rather than relying on Solr UUIDs. Migrated aggregations jobs from Fusion 4 will need this set to false.",
"default": true,
"hints": [
"advanced"
]
},
"optimizeSegments": {
"type": "integer",
"title": "Optimize Segments",
"description": "If set to a value above 0, the aggregator job will optimize the resulting Solr collection into this many segments",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"sql_template"
],
"default": "sql_template",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
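In the Parameterized SQL Aggregation job above, the "parameters" array supplies key/value pairs that are bound into the built-in SQL template at runtime, and "sourceCatchup" decides whether only signals newer than the last successful run are aggregated. A sketch of a plausible configuration; the POST path is an assumption (job-configuration endpoints vary by Fusion release, so verify it against your version's API reference), and the parameter and collection names are illustrative:

import requests

# Hypothetical configuration endpoint; confirm the exact path for your release.
url = "https://{FUSION HOST}/api/spark/configurations"
headers = {"Authorization": "Basic <encoded-value>"}

job = {
    "id": "daily-click-rollup",            # illustrative job ID
    "type": "sql_template",
    "inputCollection": "mystore_signals",  # illustrative collection
    "dataFormat": "solr",
    "sourceCatchup": True,  # aggregate only signals created since the last successful run
    "parameters": [
        {"key": "signalTypes", "value": "click,cart"}  # hypothetical template parameter
    ],
}

response = requests.post(url, headers=headers, json=job)
print(response.status_code, response.text)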
{
"type": "object",
"title": "Token and Phrase Spell Correction (Deprecated)",
"description": "Use this job to compute token and phrase level spell correction which you can use in your synonym list. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection containing search strings and event counts. Should ideally be the signals collection.If an aggregation collection is being used, update the filter query in the advanced options",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Input Field",
"description": "Field containing search strings.",
"default": "query",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Data filter query",
"description": "Solr query to use when loading training data if using Solr (e.g. type:click OR type:response), Spark SQL expression for all other data sources",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Collection to store misspelling and correction pairs. Defaults to the query_rewrite_staging collection for the application.",
"hints": [
"dummy"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"stopwordsBlobName": {
"type": "string",
"title": "Stopwords blob (Deprecated)",
"description": "Name of stopwords blob resource (.txt or .rtf file uploaded to the blob store). This field is marked for deprecation. Going forward, please specify the stopwords blob name as a luceneSchema property.",
"hints": [
"advanced"
],
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
},
"dictionaryCollection": {
"type": "string",
"title": "Dictionary Collection",
"description": "Solr Collection containing dictionary with correct spellings. E.g., product catalog."
},
"dictionaryField": {
"type": "string",
"title": "Dictionary Field",
"description": "Solr field containing dictionary text. Multiple fields can be specified using the format: field1,field2 etc."
},
"countField": {
"type": "string",
"title": "Count Field",
"description": "Solr field containing query count",
"default": "count_i"
},
"mainType": {
"type": "string",
"title": "Main Event Type",
"description": "The main signal event type (e.g. click) that the job is based on if input is signal data. E.g., if main type is click, then head and tail tokens/phrases are defined by the number of clicks.",
"default": "click"
},
"filterType": {
"type": "string",
"title": "Filtering Event Type",
"description": "The secondary event type (e.g. response) that can be used for filtering out rare searches.Note: In order to use this `response` default value, please make sure you have type:response in the input collection.If there is no need to filter on number of searches, please leave this parameter blank.",
"default": "response"
},
"signalTypeField": {
"type": "string",
"title": "Field Name of Signal Type",
"description": "The field name of signal type in the input collection.",
"default": "type",
"hints": [
"advanced"
]
},
"minCountMain": {
"type": "integer",
"title": "Minimum Main Event Count",
"description": "Minimum number of main events (e.g. clicks after aggregation) necessary for the query to be considered. The job will only analyze queries with clicks greater or equal to this number.",
"default": 1
},
"minCountFilter": {
"type": "integer",
"title": "Minimum Filtering Event Count",
"description": "Minimum number of filtering events (e.g. searches after aggregation) necessary for the query to be considered. The job will only analyze queries that were issued greater or equal to this number of times.",
"default": 10
},
"dictionaryDataFilterQuery": {
"type": "string",
"title": "Dictionary Data Filter Query",
"description": "Solr query to use when loading dictionary data",
"default": "*:*",
"hints": [
"advanced"
]
},
"minPrefix": {
"type": "integer",
"title": "Minimum Prefix Match",
"description": "The minimum number of matches on starting characters. Note: Setting it to 0 may largely increase running time. ",
"default": 1,
"minimum": 0,
"exclusiveMinimum": false
},
"minMispellingLen": {
"type": "integer",
"title": "Minimum Length of Misspelling",
"description": "The minimum length of misspelling to check. Smaller number may lead to problematic corrections. E.g., It is hard to find the right correction for a two or three character string. ",
"default": 5,
"minimum": 1,
"exclusiveMinimum": false
},
"maxDistance": {
"type": "integer",
"title": "Maximum Edit Distance",
"description": "The maximum edit distance between related token/phrases you are interested in. Large number leads to longer correction list but may add lower quality corrections. ",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"lastCharMatchBoost": {
"type": "number",
"title": "Last Character Match Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on last character match between misspelling and correction strings",
"default": 1,
"hints": [
"advanced"
]
},
"soundMatchBoost": {
"type": "number",
"title": "Sound Match Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on soundex match between misspelling and correction strings",
"default": 3,
"hints": [
"advanced"
]
},
"correctCntBoost": {
"type": "number",
"title": "Correction Count Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on count of correction string occurrences.",
"default": 2,
"hints": [
"advanced"
]
},
"editDistBoost": {
"type": "number",
"title": "Edit Distance Boost",
"description": "When there are multiple possible corrections, we rank corrections based on: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. Big number puts more weight on shorter edit distance.",
"default": 2,
"hints": [
"advanced"
]
},
"signalDataIndicator": {
"type": "boolean",
"title": "Input is Signal Data",
"description": "The input dataset that the spell checker based on is signal data. If the input data is content document rather than signal, please uncheck.",
"default": true
},
"analyzerConfigQuery": {
"type": "string",
"title": "Lucene Analyzer Schema for Processing Queries",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"analyzerConfigDictionary": {
"type": "string",
"title": "Lucene Analyzer Schema for Processing Dictionary",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"correctionThreshold": {
"type": "number",
"title": "Correct Spelling Threshold",
"description": "The count of occurrence ABOVE which the token/phrases are likely to be corrected spellings. Note that this number can be either fraction (<1.0) to denote a quantile based on count number distribution (shown in the log) or a number (>1.0) to denote the absolute count. A big number may cause performance issues.",
"default": 0.8,
"hints": [
"advanced"
]
},
"misspellingThreshold": {
"type": "number",
"title": "Misspelling Threshold",
"description": "The count of occurrence BELOW which the token/phrases are likely to be misspellings. Note that this number can be either fraction (<1.0) to denote a quantile based on count number distribution (shown in the log) or a number (>1.0) to denote the absolute count.",
"default": 0.8,
"hints": [
"advanced"
]
},
"lenScale": {
"type": "integer",
"title": "Edit Dist vs String Length Scale",
"description": "A scaling factor used to normalize the length of query string to compare against edit distances. The filtering is based on if edit_dist <= string_length/length_scale. A large value for this factor leads to a shorter correction list. A small value leads to a longer correction list but may add lower quality corrections.",
"default": 5,
"hints": [
"advanced"
]
},
"corMisRatio": {
"type": "number",
"title": "Correction and Misspelling Count Ratio",
"description": "Ratio between correction occurrence count and misspelling occurrence count. Pairs with ratio less than or equal to this number will be filtered. Big number leads to shorter correction list and may have higher quality corrections.",
"default": 3,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"tokenPhraseSpellCorrection"
],
"default": "tokenPhraseSpellCorrection",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed",
"signalDataIndicator"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"signalTypeField",
"mainType",
"filterType",
"countField"
]
},
{
"label": "Boost Parameters",
"properties": [
"lastCharMatchBoost",
"soundMatchBoost",
"correctCntBoost",
"editDistBoost"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minCountMain",
"minCountFilter",
"correctionThreshold",
"misspellingThreshold",
"lenScale",
"corMisRatio",
"maxDistance",
"minMispellingLen",
"minPrefix"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfigQuery"
]
},
{
"label": "Misc. Parameters",
"properties": [
"stopwordsBlobName",
"dictionaryCollection",
"dictionaryField",
"dictionaryDataFilterQuery",
"analyzerConfigDictionary"
]
}
]
},
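Although this job is deprecated, the four boost parameters above all feed one ranking formula, repeated in each description: editDistBoost / editDist + correctionCountBoost * log(correctionCount) + lastCharMatchBoost * lastCharMatch + soundMatchBoost * soundexMatch. A small illustration with the schema's default boosts; the candidate numbers are made up, and treating the two match terms as 0/1 indicators is an assumption:

import math

# Default boosts from the schema above.
EDIT_DIST_BOOST = 2.0
CORRECTION_CNT_BOOST = 2.0
LAST_CHAR_MATCH_BOOST = 1.0
SOUND_MATCH_BOOST = 3.0

def correction_score(edit_dist, correction_count, last_char_match, soundex_match):
    # last_char_match / soundex_match assumed to be 1 on a match, else 0.
    return (EDIT_DIST_BOOST / edit_dist
            + CORRECTION_CNT_BOOST * math.log(correction_count)
            + LAST_CHAR_MATCH_BOOST * last_char_match
            + SOUND_MATCH_BOOST * soundex_match)

# Made-up candidates for the misspelling "restarant":
print(correction_score(1, 500, 1, 1))  # "restaurant": close, frequent, sounds alike
print(correction_score(2, 40, 0, 0))   # "restart": farther, rarer, no sound match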
{
"type": "object",
"title": "SQL-Based Experiment Metric",
"description": "This job is created by an experiment in order to calculate an objective",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"experimentId": {
"type": "string",
"title": "Experiment ID",
"hints": [
"readonly"
]
},
"metricName": {
"type": "string",
"title": "Objective name",
"hints": [
"readonly"
]
},
"notes": {
"type": "string",
"title": "Notes",
"description": "A short description about this job.",
"hints": [
"lengthy"
]
},
"sql": {
"type": "string",
"title": "SQL",
"hints": [
"readonly"
]
},
"experiment": {
"type": "object",
"title": "Experiment",
"required": [
"id",
"baseSignalsCollection",
"metrics"
],
"properties": {
"id": {
"type": "string",
"title": "ID",
"maxLength": 128,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"description": {
"type": "string",
"title": "Description"
},
"uniqueIdParameter": {
"type": "string",
"title": "Unique ID Parameter",
"description": "The name of the request parameter containing the user ID",
"default": "userId"
},
"baseSignalsCollection": {
"type": "string",
"title": "Base Collection for Signals",
"description": "Signals resulting from requests that flow through this experiment will go into the signal collection associated with this collection",
"minLength": 1,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"variants": {
"type": "array",
"title": "Variants",
"description": "Specify what varies in this variant, and optionally change the traffic weight",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string",
"title": "Variant id",
"hints": [
"hidden"
],
"maxLength": 128,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"name": {
"type": "string",
"title": "Name"
},
"queryPipeline": {
"type": "string",
"title": "Query Pipeline"
},
"params": {
"type": "array",
"title": "Query Params",
"description": "URL parameters to add to queries using this variant",
"items": {
"type": "object",
"required": [
"key",
"value"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
},
"policy": {
"type": "string",
"title": "Update Policy",
"enum": [
"replace",
"append",
"remove",
"default"
],
"default": "append"
}
}
}
},
"collection": {
"type": "string",
"title": "Collection"
},
"weight": {
"type": "number",
"title": "Weight",
"description": "Proportion of traffic to send to this variant. Higher values mean proportionally more traffic will be routed to this variant",
"default": 1,
"minimum": 0.01,
"exclusiveMinimum": false
}
}
}
},
"metrics": {
"type": "array",
"title": "Objectives",
"description": "Metrics that will be used to evaluate the variants",
"minItems": 1,
"items": {
"type": "object",
"required": [
"name"
],
"properties": {
"name": {
"type": "string",
"title": "Name"
},
"description": {
"type": "string",
"title": "Description"
},
"primary": {
"type": "boolean",
"title": "Primary",
"description": "Whether this metric is the primary metric used for evaluating the variants (the 'OEC')."
},
"jobId": {
"type": "string",
"title": "Associated Spark Job ID",
"hints": [
"hidden"
]
},
"binary": {
"type": "boolean",
"title": "Binary-valued metric",
"description": "Whether this metric measures a Bernoulli trial (clicks, cart adds, etc) or a continuous-valued event.",
"hints": [
"hidden"
]
}
}
}
},
"enabled": {
"type": "boolean",
"title": "Enabled",
"default": true,
"hints": [
"readonly"
]
},
"startTimestamp": {
"type": "string",
"title": "Start Date",
"description": "When the experiment last started",
"hints": [
"readonly"
],
"format": "date-time"
},
"runId": {
"type": "string",
"title": "Run Identifier",
"hints": [
"readonly",
"hidden"
]
},
"automaticallyAdjustTraffic": {
"type": "boolean",
"title": "Automatically Adjust Weights Between Variants",
"default": false
}
},
"hints": [
"hidden"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"experiment_sql"
],
"default": "experiment_sql",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
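One detail of the experiment object above that is easy to misread: variant weights are relative, not percentages. The description says higher values route proportionally more traffic, so the implied traffic share of a variant is its weight divided by the sum of all weights. A quick illustration of that arithmetic (the routing itself is handled by Fusion):

# Variant weights are relative; implied share = weight / sum(weights).
variants = [
    {"name": "control", "weight": 1.0},
    {"name": "boosted-pipeline", "weight": 3.0},  # illustrative variant names
]
total = sum(v["weight"] for v in variants)
for v in variants:
    print(f'{v["name"]}: {v["weight"] / total:.0%}')  # control: 25%, boosted-pipeline: 75%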
{
"type": "object",
"title": "Custom Spark Job",
"description": "Use this job when you want to run a custom JAR on Spark",
"required": [
"id",
"klassName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"klassName": {
"type": "string",
"title": "Class name",
"description": "Fully-qualified name of the Java/Scala class to invoke"
},
"submitArgs": {
"type": "array",
"title": "Script args",
"description": "Additional options to pass to the application when running this job.",
"items": {
"type": "string"
}
},
"script": {
"type": "string",
"title": "Scala Script",
"description": "Use this text field if you want to override the default behaviour, which is to run className.main(args)",
"hints": [
"code/scala",
"lengthy",
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"custom_spark_scala_job"
],
"default": "custom_spark_scala_job",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
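A Custom Spark Job needs only "id", the fully-qualified "klassName", and "type"; by default Fusion runs className.main(args) with the "submitArgs" values, unless the Scala "script" field overrides that behaviour. A minimal sketch of such a configuration, where the class name and arguments are hypothetical placeholders for your own JAR; submit it the same way as the SQL aggregation example above:

# Minimal custom_spark_scala_job config; klassName and submitArgs are
# placeholders for a class in a JAR you have made available to Fusion.
job = {
    "id": "session-stats",
    "type": "custom_spark_scala_job",
    "klassName": "com.example.analytics.SessionStats",  # hypothetical class
    "submitArgs": ["--signals", "mystore_signals", "--days", "30"],
}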
{
"type": "object",
"title": "Delete Indexes in Milvus (deprecated)",
"description": "Deletes specified indexes in Milvus collections",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"collections": {
"type": "array",
"title": "Collections",
"description": "List of collections in Milvus where indexes should be dropped.",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-delete-indexes"
],
"default": "argo-milvus-delete-indexes",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
{
"type": "object",
"title": "Document Clustering",
"description": "Use this job when you want to cluster a set of documents and attach cluster labels based on topics.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"outputCollection",
"uidField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing documents to be clustered",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"uidField": {
"type": "string",
"title": "ID Field Name",
"description": " Field containing the unique ID for each document.",
"default": "id",
"minLength": 1
},
"clusterIdField": {
"type": "string",
"title": "Output Field Name for Cluster Id",
"description": "Output field name for unique cluster id.",
"default": "cluster_id"
},
"clusterLabelField": {
"type": "string",
"title": "Detected Cluster Keywords Field Name",
"description": "Output field name for top frequent terms that are (mostly) unique for each cluster.",
"default": "cluster_label"
},
"freqTermField": {
"type": "string",
"title": "Top Frequent Terms Field Name",
"description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
"default": "freq_terms"
},
"distToCenterField": {
"type": "string",
"title": "Output Field Name for doc distance to its cluster center",
"description": "Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).",
"default": "dist_to_center"
},
"minDF": {
"type": "number",
"title": "Min Doc Support",
"description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 5
},
"maxDF": {
"type": "number",
"title": "Max Doc Support",
"description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.5
},
"kExact": {
"type": "integer",
"title": "Number of Clusters",
"description": "Exact number of clusters.",
"default": 0
},
"kMax": {
"type": "integer",
"title": "Max Possible Number of Clusters",
"description": "Max possible number of clusters.",
"default": 20
},
"kMin": {
"type": "integer",
"title": "Min Possible Number of Clusters",
"description": "Min possible number of clusters.",
"default": 2
},
"docLenTrim": {
"type": "boolean",
"title": "Find Extreme Length Doc Flag",
"description": " Whether to separate out docs with extreme lengths.",
"default": true
},
"outlierTrim": {
"type": "boolean",
"title": "Find Outliers Flag",
"description": " Whether to perform outlier detection.",
"default": true
},
"shortLen": {
"type": "number",
"title": "Length Threshold for Short Doc",
"description": "Length threshold to define short document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 5
},
"longLen": {
"type": "number",
"title": "Length Threshold for Long Doc",
"description": "Length threshold to define long document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 0.99
},
"numKeywordsPerLabel": {
"type": "integer",
"title": "Number of Keywords for Each Cluster",
"description": "Number of Keywords needed for labeling each cluster.",
"default": 5
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"w2vDimension": {
"type": "integer",
"title": "Word2Vec Dimension",
"description": "Word-vector dimensionality to represent text (choose > 0 to use, suggested dimension ranges: 100~150)",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 8,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"patternreplace\", \"pattern\": \"^[\\\\d.]+$\", \"replacement\": \" \", \"replace\": \"all\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"code/json",
"advanced",
"lengthy"
],
"minLength": 1
},
"clusteringMethod": {
"type": "string",
"title": "Clustering Method (hierarchical or kmeans)",
"description": "Choose between hierarchical vs kmeans clustering.",
"default": "hierarchical",
"hints": [
"advanced"
]
},
"outlierK": {
"type": "integer",
"title": "Number of outlier groups",
"description": "Number of clusters to help find outliers.",
"default": 10,
"hints": [
"advanced"
]
},
"outlierThreshold": {
"type": "number",
"title": "Outlier cutoff",
"description": "Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 0.01,
"hints": [
"advanced"
]
},
"minDivisibleSize": {
"type": "number",
"title": "Minimum divisible cluster size",
"description": "Clusters must have at least this many documents to be split further. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number. ",
"default": 0,
"hints": [
"advanced"
]
},
"kDiscount": {
"type": "number",
"title": "Discount for K when choosing number of clusters",
"description": "Applies a discount to help favor large/small K (number of clusters). A smaller value pushes K to assume a higher value within the [min, max] K range.",
"default": 0.7,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"doc_clustering"
],
"default": "doc_clustering",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"uidField",
"clusterIdField",
"freqTermField",
"clusterLabelField",
"distToCenterField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"clusteringMethod",
"outlierTrim",
"outlierK",
"outlierThreshold",
"kExact",
"kMax",
"kMin",
"w2vDimension",
"w2vWindowSize",
"maxDF",
"minDF",
"norm",
"numKeywordsPerLabel",
"minDivisibleSize",
"kDiscount"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig",
"docLenTrim",
"longLen",
"shortLen"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
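Pulling the Document Clustering schema together: the required fields are id, trainingCollection, fieldToVectorize, dataFormat, outputCollection, uidField, and type, and fieldToVectorize accepts the weighted field1:weight1,field2:weight2 syntax described above. A sketch of a plausible configuration with illustrative collection and field names, leaving kExact at its default of 0 so the cluster count is chosen within [kMin, kMax] (as the kDiscount description implies):

# Illustrative doc_clustering config covering every required field.
job = {
    "id": "support-doc-clusters",
    "type": "doc_clustering",
    "trainingCollection": "support_articles",      # illustrative collection
    "fieldToVectorize": "title_t:2.0,body_t:1.0",  # weighted multi-field syntax
    "dataFormat": "solr",
    "outputCollection": "support_articles_clustered",
    "uidField": "id",
    "kExact": 0,  # 0 = let the job pick K between kMin and kMax
    "kMin": 2,
    "kMax": 20,
    "clusteringMethod": "hierarchical",
}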
{
"type": "object",
"title": "Smart Answers Supervised Training",
"description": "Trains Smart Answers model on a supervised basis with pre-trained or trained embeddings and deploys the trained model to the ML Model Service",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"questionColName",
"answerColName",
"deployModelName",
"modelReplicas",
"modelBase",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"useAutoML": {
"type": "boolean",
"title": "Perform auto hyperparameter tuning",
"description": "Automatically tune hyperparameters (will take longer to train). Transformer models aren't used in this regime",
"default": false
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"questionColName": {
"type": "string",
"title": "Question Field",
"description": "Name of the field containing questions",
"minLength": 1
},
"answerColName": {
"type": "string",
"title": "Answer Field",
"description": "Name of the field containing answers",
"minLength": 1
},
"weightColName": {
"type": "string",
"title": "Weight Field",
"description": "Name of the field to be used for weights",
"minLength": 1
},
"deployModelName": {
"type": "string",
"title": "Model Deployment Name",
"description": "Name of the model to be used for deployment (must be a valid lowercased DNS subdomain with no underscores)",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"testMode": {
"type": "boolean",
"title": "Test Mode",
"description": "If set to true, then the training will exit after the first iteration. Useful for ensuring that the end-to-end pipeline is working",
"default": false,
"hints": [
"hidden"
]
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1
},
"modelBase": {
"type": "string",
"title": "Model base",
"description": "Specify one of these custom embeddings: ['word_custom', 'bpe_custom'] or choose one of the included pre-trained embeddings / models.",
"enum": [
"word_custom",
"bpe_custom",
"word_en_300d_2M",
"bpe_en_300d_10K",
"bpe_en_300d_200K",
"bpe_ja_300d_100K",
"bpe_ko_300d_100K",
"bpe_zh_300d_50K",
"bpe_multi_300d_320K",
"distilbert_en",
"distilbert_multi",
"biobert_v1.1"
],
"default": "word_en_300d_2M"
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "The proportion of data to be sampled from the full dataset. Use a value between 0 and 1 for a proportion (e.g. 0.5 for 50%), or for a specific number of examples, use an integer larger than 1. Leave blank for no sampling",
"hints": [
"advanced"
]
},
"seed": {
"type": "integer",
"title": "Seed",
"description": "Random seed for sampling",
"default": 12345,
"hints": [
"hidden"
]
},
"minTokensNum": {
"type": "integer",
"title": "Minimum number of words in doc",
"description": "Drop document if the total words is lower than this value",
"default": 1,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"maxTokensNum": {
"type": "integer",
"title": "Maximum number of words in doc",
"description": "Drop document if the total words is greater than this value",
"default": 5000,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"lowerCases": {
"type": "boolean",
"title": "Lower case all words",
"description": "Whether to lower case all words in training, i.e. whether to treat upper case and lower case words equally. Only utilized for custom embeddings or for the default model base: word_en_300d_2M.",
"default": false
},
"maxVocabSize": {
"type": "integer",
"title": "Maximum vocabulary size",
"description": "Maximum number of words in vocabulary, words will be trimmed if frequency is too low. Only utilized for custom embeddings or for the default model base: word_en_300d_2M.",
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"w2vEpochs": {
"type": "integer",
"title": "Word2Vec training epochs",
"description": "Number of epochs to train custom word2vec embeddings",
"default": 15,
"hints": [
"advanced"
]
},
"w2vTextsCollection": {
"type": "string",
"title": "Texts data path",
"description": "Solr collection or cloud storage path which contains extra documents that will be used to get better vocabulary coverage as well as to train custom word embeddings if custom Model Base is specified."
},
"w2vTextColumns": {
"type": "string",
"title": "Texts collection fields",
"description": "Which fields in the text collection to use. If multiple fields, please separate them by comma, e.g. description_t,title_t."
},
"textsFormat": {
"type": "string",
"title": "Texts format",
"description": "The format of the texts training data - solr, parquet etc."
},
"w2vVectorSize": {
"type": "integer",
"title": "Size of word vectors",
"description": "Word-vector dimensionality to represent text (suggested dimension ranges: 100~300)",
"default": 150,
"hints": [
"advanced"
]
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec window size",
"description": "The window size (context words from [-window, window]) for Word2Vec",
"default": 8,
"hints": [
"advanced"
]
},
"valSize": {
"type": "number",
"title": "Validation sample size",
"description": "Proportion of the unique questions that should be used as validation samples. When val_size > 1, then that specific number of unique questions will be sampled rather than a proportion.",
"default": 0.1,
"minimum": 0.001,
"exclusiveMinimum": false
},
"maxLen": {
"type": "integer",
"title": "Max length",
"description": "Maximum length of text processed by the model. Texts longer than this value will be trimmed. This parameter is especially important for Transformer-based models as it affects training and inference time. Note that the maximum supported length for Transformer models is 512, so you can specify any value up to that. The default value is the max value between three times the STD of question lengths and two times the STD of answer lengths.",
"hints": [
"advanced"
]
},
"embSPDP": {
"type": "number",
"title": "Dropout ratio",
"description": "Fraction of input to drop with Dropout layer (from 0-1)",
"default": 0.3
},
"trainBatch": {
"type": "integer",
"title": "Training batch size",
"description": "Batch size during training. If left blank, this will be set automatically based on the input data"
},
"infBatch": {
"type": "integer",
"title": "Inference batch size used in validation",
"description": "Batch size during validation. If left blank, this will be set automatically based on the input data",
"hints": [
"advanced"
]
},
"rnnNamesList": {
"type": "string",
"title": "RNN function list",
"description": "List of layers of RNNs to be used, with possible values of lstm, gru. E.g. [\"lstm\", \"lstm\"]. This value will be automatically decided based on data if left blank"
},
"rnnUnitsList": {
"type": "string",
"title": "RNN function units list",
"description": "List of RNN layer units numbers, corresponding to RNN function list. E.g. 150, 150. This value will be automatically decided based on data if left blank"
},
"epochs": {
"type": "integer",
"title": "Number of epochs to be used in training"
},
"weightDecay": {
"type": "number",
"title": "Weight decay",
"description": "L2 penalty used in AdamW optimizer. Bigger values will provide stronger regularization. Default values are 0.0003 for RNN models and 0.01 for Transformer models."
},
"monitorPatience": {
"type": "integer",
"title": "Monitor patience",
"description": "Stop training if no improvement in metrics by this number of epochs"
},
"baseLR": {
"type": "number",
"title": "Base learning rate",
"description": "Base learning rate that should be used during training. Reasonable values are from 0.0001 to 0.003 depending on model base. It's better to use lower LR with Transformer models."
},
"minLR": {
"type": "number",
"title": "Minimum learning rate",
"description": "Minimum learning rate used during training. Reasonable values are from 0.00001 to 0.00003.",
"hints": [
"advanced"
]
},
"numWarmUpEpochs": {
"type": "integer",
"title": "Number of warm-up epochs",
"description": "Number of epochs used for the warm-up stage for learning rates. Reasonable values are from 0-4 epochs, usually 1-2 are used."
},
"numFlatEpochs": {
"type": "integer",
"title": "Number of flat epochs",
"description": "Number of epochs used in flat stage for learning rates. Reasonable value would be one-half of the epochs, so the other half will be with Cosine Annealing learning rate."
},
"extraTrainingArgs": {
"type": "string",
"title": "Extra training args for Python scripts",
"description": "Add any additional arguments for the Python training scripts in this field",
"hints": [
"hidden"
]
},
"monitorMetric": {
"type": "string",
"title": "Monitor metric",
"description": "The main metric at k that should be monitored to decide when to stop training. Possible metrics are: [\"map\", \"mrr\", \"recall\", \"precision\"]",
"default": "mrr@3"
},
"monitorMetricsList": {
"type": "string",
"title": "Metrics list",
"description": "List of evaluation metrics on validation data that will be printed in the log at the end of each epoch. Possible metrics are: [\"map\", \"mrr\", \"recall\", \"precision\"]",
"default": "[\"map\", \"mrr\", \"recall\"]"
},
"kList": {
"type": "string",
"title": "Metrics@k list",
"description": "The k retrieval position that will be used to compute for each metric",
"default": "[1,3,5]"
},
"numClusters": {
"type": "integer",
"title": "Number of clusters",
"description": "DEPRECATED: please, consider using Milvus for fast dense vector similarity search. Number of clusters to be used for fast dense vector retrieval. Note no clustering will be applied if this is set to 0. If left blank, cluster count will be inferred by the job depending on the data",
"default": 0,
"hints": [
"advanced"
]
},
"topKClusters": {
"type": "integer",
"title": "Top k of clusters to return",
"description": "How many closest clusters the model can find for each query. At retrieval time, all answers in top k nearest clusters will be returned and reranked",
"default": 10,
"hints": [
"advanced"
]
},
"unidecode": {
"type": "boolean",
"title": "Apply unicode decoding",
"description": "Use Unidecode library to transform Unicode input into ASCII transliterations. Only utilized for custom embeddings or for the default model base: word_en_300d_2M",
"default": true
},
"useMixedPrecision": {
"type": "string",
"title": "Use Mixed Precision",
"description": "Check this option to train a model with mixed precision support.This will only work if the node has a GPU. You'll only see a speed up on newer NVidia GPUs (Turing and later) with Transformer models.",
"enum": [
"auto",
"true",
"false"
],
"default": "auto",
"hints": [
"advanced"
]
},
"useLabelingResolution": {
"type": "boolean",
"title": "Use Labeling Resolution",
"description": "Check this to determine similar questions and similar answers via labeling resolution and graph connected components. Does not work well with noisy data like eCommerce queries. But helps with FAQ / QnA data.",
"default": false
},
"useLayerNorm": {
"type": "boolean",
"title": "Use Layer Norm",
"description": "Check this to use layer norm for pooling.",
"default": false,
"hints": [
"advanced"
]
},
"globalPoolType": {
"type": "string",
"title": "Global Pool Type",
"description": "Determines how token vectors should be aggregated to obtain final content vector. Must be one of: [avg, max, self_attention].",
"enum": [
"avg",
"max",
"self_attention"
],
"default": "self_attention",
"hints": [
"advanced"
]
},
"embTrainable": {
"type": "boolean",
"title": "Fine-tune Token Embeddings",
"description": "Choose this to fine-tune token embeddings during model training. Tends to work well with eCommerce data.",
"default": false,
"hints": [
"advanced"
]
},
"eps": {
"type": "number",
"title": "Eps",
"description": "Epsilon is the AdamW optimizer. By default 1e-8 is used for RNN models and 1e-6 is used for Transformer models.",
"hints": [
"advanced"
]
},
"maxGradNorm": {
"type": "number",
"title": "Max Grad Norm",
"description": "Max norm used for gradients clipping. By default it’s not used for RNN models but 1.0 value is used for Transformer models.",
"hints": [
"advanced"
]
},
"useXbm": {
"type": "string",
"title": "Use Cross-batch memory",
"description": "Stores encoded representations of previous batches in memory for better negative examples sampling. Works well for Transformer models. Leave this at 'auto' to let the training module determine this.",
"enum": [
"auto",
"true",
"false"
],
"default": "auto",
"hints": [
"advanced"
]
},
"xbmMemorySize": {
"type": "integer",
"title": "Cross-batch memory size",
"description": "Number of examples from the previous batches that are stored in memory. The default size for Transformer models is 256.",
"hints": [
"advanced"
]
},
"xbmEpochActivation": {
"type": "integer",
"title": "Cross-batch epoch activation",
"description": "After which epoch cross-batch memory should be activated. By default it’s activated after the first epoch for Transformer models.",
"hints": [
"advanced"
]
},
"evalAnnIndex": {
"type": "string",
"title": "Eval ANN index",
"description": "Choose this to use Approximate Nearest Neighbor search during evaluation. For big datasets it can speed up the evaluation time with minimum loss in accuracy, for small datasets it will most likely make it slower.",
"enum": [
"auto",
"true",
"false"
],
"default": "auto",
"hints": [
"advanced"
]
},
"distance": {
"type": "string",
"title": "Distance",
"description": "Vectors distance/similarity that should be used during training and in the pipelines. Choose one of: ['cosine_similarity', 'dot_product_similarity', 'euclidean_distance'].",
"enum": [
"cosine_similarity",
"dot_product_similarity",
"euclidean_distance"
],
"default": "cosine_similarity",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-qna-supervised"
],
"default": "argo-qna-supervised",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"trainingDataFilterQuery",
"seed",
"trainingSampleFraction",
"questionColName",
"answerColName",
"weightColName",
"w2vTextsCollection",
"w2vTextColumns",
"textsFormat",
"deployModelName",
"modelReplicas",
"secretName"
]
},
{
"label": "Data Preprocessing",
"properties": [
"useLabelingResolution",
"unidecode",
"lowerCases",
"minTokensNum",
"maxTokensNum",
"maxVocabSize"
]
},
{
"label": "Custom Embeddings Initialization",
"properties": [
"w2vEpochs",
"w2vVectorSize",
"w2vWindowSize"
]
},
{
"label": "Evaluation Parameters",
"properties": [
"valSize",
"monitorMetric",
"monitorPatience",
"monitorMetricsList",
"kList",
"evalAnnIndex"
]
},
{
"label": "General Encoder Parameters",
"properties": [
"embTrainable",
"maxLen",
"globalPoolType",
"useLayerNorm",
"numClusters",
"topKClusters"
]
},
{
"label": "RNN Encoder Parameters",
"properties": [
"embSPDP",
"rnnNamesList",
"rnnUnitsList"
]
},
{
"label": "Training Parameters",
"properties": [
"epochs",
"trainBatch",
"infBatch",
"baseLR",
"numWarmUpEpochs",
"numFlatEpochs",
"minLR",
"weightDecay",
"distance",
"eps",
"maxGradNorm",
"useMixedPrecision",
"useXbm",
"xbmMemorySize",
"xbmEpochActivation"
]
}
]
},
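As a concrete illustration, the sketch below shows how a job of this type might be created through the Spark jobs API. The endpoint path (/api/spark/configurations) and every field value here are assumptions for illustration; only the property names and the argo-qna-supervised type come from the schema above.
import requests

FUSION = "https://{FUSION HOST}"
headers = {"Authorization": "Basic <encoded-value>"}

# Hypothetical minimal config; collection and model names are placeholders.
qna_job = {
    "id": "my-qna-job",                      # must match the Spark Job ID pattern
    "type": "argo-qna-supervised",
    "trainingCollection": "faq_data",        # assumed input collection
    "deployModelName": "qna-model",          # lowercase DNS subdomain, no underscores
    "monitorMetricsList": '["map", "mrr", "recall"]',
    "kList": "[1,3,5]",
}

# Assumed endpoint for creating Spark job configurations.
resp = requests.post(f"{FUSION}/api/spark/configurations", json=qna_job, headers=headers)
resp.raise_for_status()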
{
"type": "object",
"title": "Phrase Extraction (Deprecated)",
"description": "Use this job when you want to identify statistically significant phrases in your content. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"analyzerConfig",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 8180,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store extracted phrases; defaults to the query_rewrite_staging collection for the associated app."
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"ngramSize": {
"type": "integer",
"title": "Ngram Size",
"description": "The number of words in the ngram you want to consider for the sips.",
"default": 3,
"maximum": 5,
"exclusiveMaximum": false,
"minimum": 2,
"exclusiveMinimum": false
},
"minmatch": {
"type": "integer",
"title": "Minimum Count",
"description": "The number of times a phrase must exist to be considered. NOTE: if input is non signal data, please reduce the number to e.g. 5.",
"default": 100,
"minimum": 1,
"exclusiveMinimum": false
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Text Analyzer",
"description": "The style of text analyzer you would like to use.",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
]
},
"attachPhrases": {
"type": "boolean",
"title": "Extract Key Phrases from Input Text",
"description": "Checking this will cause the job to associate extracted phrases from each source doc. and write them back to the output collection. If input data is signals, it is suggested to turn this option off. Also, currently it is not allowed to check this option while attempting to write to a _query_rewrite_staging collection.",
"default": false,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"minLikelihood": {
"type": "number",
"title": "Minimum Likelihood Score",
"description": "Phrases below this threshold will not be written in the output of this job.",
"hints": [
"advanced"
]
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"sip"
],
"default": "sip",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"minmatch",
"ngramSize"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
}
]
},
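Before submitting a Phrase Extraction (sip) config, it can be handy to check the job ID locally against the pattern the schema declares. A minimal sketch, assuming hypothetical collection and field names:
import re

ID_PATTERN = re.compile(r"[a-zA-Z][_\-a-zA-Z0-9]*[a-zA-Z0-9]?")  # pattern from the schema

sip_job = {
    "id": "extract-phrases",             # hypothetical job id
    "type": "sip",
    "trainingCollection": "products",    # hypothetical content collection
    "fieldToVectorize": "title_t",       # hypothetical text field
    "dataFormat": "solr",
    "analyzerConfig": "<analyzer JSON>", # placeholder; use the schema default in practice
    "minmatch": 5,                       # lowered because the input here is non-signal data
    "ngramSize": 3,
}

# Check the id against the schema's pattern and length limit before submitting.
assert ID_PATTERN.fullmatch(sip_job["id"]) and len(sip_job["id"]) <= 63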
{
"type": "object",
"title": "Parallel Bulk Loader",
"description": "Use this job when you want to load data into Fusion from a SparkSQL compliant datasource, and send this data to any Spark supported datasource (Solr/Index Pipeline/S3/GCS/...).",
"required": [
"id",
"format",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"format": {
"type": "string",
"title": "Format",
"description": "Specifies the input data source format; common examples include: parquet, json, textinputformat"
},
"path": {
"type": "string",
"title": "Path",
"description": "Path to load; for data sources that support multiple paths, separate by commas"
},
"streaming": {
"type": "object",
"title": "Streaming",
"required": [
"enableStreaming"
],
"properties": {
"enableStreaming": {
"type": "boolean",
"title": "Enable Streaming",
"description": "Stream data from input source to output Solr collection"
},
"outputMode": {
"type": "string",
"title": "Output mode",
"description": "Specifies the output mode for streaming. E.g., append (default), complete, update",
"enum": [
"append",
"complete",
"update"
],
"default": "append"
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options passed to the data source to configure the read operation; options differ for every data source so refer to the documentation for more information.",
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to send the documents loaded from the input data source."
},
"outputIndexPipeline": {
"type": "string",
"title": "Send to Index Pipeline",
"description": "Send the documents loaded from the input data source to an index pipeline instead of going directly to Solr."
},
"outputParser": {
"type": "string",
"title": "Send to Parser",
"description": "Parser to send the documents to while sending to index pipeline. (Defaults to same as index pipeline)",
"hints": [
"advanced"
]
},
"defineFieldsUsingInputSchema": {
"type": "boolean",
"title": "Define Fields in Solr?",
"description": "If true, define fields in Solr using the input schema; if a SQL transform is defined, the fields to define are based on the transformed DataFrame schema instead of the input.",
"default": true,
"hints": [
"advanced"
]
},
"atomicUpdates": {
"type": "boolean",
"title": "Send as Atomic Updates?",
"description": "Send documents to Solr as atomic updates; only applies if sending directly to Solr and not an index pipeline.",
"default": false,
"hints": [
"advanced"
]
},
"timestampFieldName": {
"type": "string",
"title": "Timestamp Field Name",
"description": "Name of the field that holds a timestamp for each document; only required if using timestamps to filter new rows from the input source.",
"hints": [
"advanced"
]
},
"clearDatasource": {
"type": "boolean",
"title": "Clear Existing Documents",
"description": "If true, delete any documents indexed in Solr by previous runs of this job. Default is false.",
"default": false,
"hints": [
"advanced"
]
},
"outputPartitions": {
"type": "integer",
"title": "Output Partitions",
"description": "Partition the input DataFrame into partitions before writing out to Solr or Fusion",
"hints": [
"advanced"
]
},
"optimizeOutput": {
"type": "integer",
"title": "Optimize",
"description": "Optimize the Solr collection down to the specified number of segments after writing to Solr.",
"hints": [
"advanced"
]
},
"cacheAfterRead": {
"type": "boolean",
"title": "Cache After Read",
"description": "Cache input data in memory (and disk as needed) after reading; default is false, setting to true may help stability of the job by reading all data from the input source first before transforming or writing to Solr. This could make the job run slower as it adds an intermediate write operation.",
"default": false,
"hints": [
"hidden"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output. For output formats other than solr or index-pipeline, format and path options can be specified here",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"transformScala": {
"type": "string",
"title": "Transform Scala",
"description": "Optional Scala script used to transform the results returned by the data source before indexing. You must define your transform script in a method with signature: def transform(inputDF: Dataset[Row]) : Dataset[Row]",
"hints": [
"advanced",
"lengthy",
"code/scala"
]
},
"mlModelId": {
"type": "string",
"title": "Spark ML PipelineModel ID",
"description": "The ID of the Spark ML PipelineModel stored in the Fusion blob store.",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "model:ml-model"
},
"transformSql": {
"type": "string",
"title": "Transform SQL",
"description": "Optional SQL used to transform the results returned by the data source before indexing. The input DataFrame returned from the data source will be registered as a temp table named '_input'. The Scala transform is applied before the SQL transform if both are provided, which allows you to define custom UDFs in the Scala script for use in your transformation SQL.",
"hints": [
"advanced",
"lengthy",
"code/sql"
]
},
"shellOptions": {
"type": "array",
"title": "Spark Shell Options",
"description": "Additional options to pass to the Spark shell when running this job.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"templateParams": {
"type": "array",
"title": "Interpreter Params",
"description": "Bind the key/values to the script interpreter",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"continueAfterFailure": {
"type": "boolean",
"title": "Continue after index failure",
"description": "If set to true, when a failure occurs when sending a document through an index pipeline, the job will continue onto the next document instead of failing",
"default": false,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"parallel-bulk-loader"
],
"default": "parallel-bulk-loader",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
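For example, loading a Parquet file into a collection with the Parallel Bulk Loader could look roughly like this. The endpoint paths and all values (bucket path, collection name) are assumptions, not part of the schema:
import requests

FUSION = "https://{FUSION HOST}"
headers = {"Authorization": "Basic <encoded-value>"}

pbl_job = {
    "id": "load-products",                     # hypothetical job id
    "type": "parallel-bulk-loader",
    "format": "parquet",                       # required: input data source format
    "path": "s3a://my-bucket/products/",       # hypothetical input path
    "outputCollection": "products",            # hypothetical target collection
    "clearDatasource": True,                   # delete docs indexed by previous runs
}

requests.post(f"{FUSION}/api/spark/configurations", json=pbl_job, headers=headers)
# Assumed run endpoint: POST to /api/spark/jobs/{id} starts the configured job.
requests.post(f"{FUSION}/api/spark/jobs/{pbl_job['id']}", headers=headers)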
{
"type": "object",
"title": "Outlier Detection",
"description": "Use this job when you want to find outliers from a set of documents and attach labels for each outlier group.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"uidField",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing documents to be clustered",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"outlierGroupIdField": {
"type": "string",
"title": "Output Field Name for Outlier Group Id",
"description": "Output field name for unique outlier group id.",
"default": "outlier_group_id"
},
"outlierGroupLabelField": {
"type": "string",
"title": "Top Unique Terms Field Name",
"description": "Output field name for top frequent terms that are (mostly) unique for each outlier group as computed based on TF-IDF and group Id.",
"default": "outlier_group_label"
},
"outputOutliersOnly": {
"type": "boolean",
"title": "Only save outliers?",
"description": "If true, only outliers are saved in the output collection, otherwise, the whole dataset is saved.",
"default": false
},
"uidField": {
"type": "string",
"title": "ID Field Name",
"description": " Field containing the unique ID for each document.",
"default": "id",
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"freqTermField": {
"type": "string",
"title": "Top Frequent Terms Field Name",
"description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
"default": "freq_terms"
},
"distToCenterField": {
"type": "string",
"title": "Output Field Name for doc distance to its cluster center",
"description": "Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).",
"default": "dist_to_center"
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"minDF": {
"type": "number",
"title": "Min Doc Support",
"description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 5
},
"maxDF": {
"type": "number",
"title": "Max Doc Support",
"description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.75
},
"numKeywordsPerLabel": {
"type": "integer",
"title": "Number of Keywords for Each Cluster",
"description": "Number of Keywords needed for labeling each cluster.",
"default": 5
},
"outlierK": {
"type": "integer",
"title": "Number of outlier groups",
"description": "Number of clusters to help find outliers.",
"default": 10,
"hints": [
"advanced"
]
},
"outlierThreshold": {
"type": "number",
"title": "Outlier cutoff",
"description": "Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.01,
"hints": [
"advanced"
]
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"outlier_detection"
],
"default": "outlier_detection",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed",
"outputOutliersOnly"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"uidField",
"outlierGroupIdField",
"outlierGroupLabelField",
"freqTermField",
"distToCenterField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"outlierK",
"outlierThreshold",
"maxDF",
"minDF",
"norm",
"numKeywordsPerLabel"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
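Because each entry in the schema's oneOf list carries its job type under properties.type.default, a config can be checked locally against the matching branch before it is ever sent to Fusion. A sketch using the third-party jsonschema package; all config values are hypothetical:
import requests
from jsonschema import validate  # pip install jsonschema

FUSION = "https://{FUSION HOST}"
headers = {"Authorization": "Basic <encoded-value>"}
schema = requests.get(f"{FUSION}/api/spark/schema", headers=headers).json()

outlier_job = {
    "id": "find-outliers",
    "type": "outlier_detection",
    "trainingCollection": "docs",        # hypothetical input collection
    "fieldToVectorize": "body_t",        # hypothetical text field
    "dataFormat": "solr",
    "uidField": "id",
    "outputCollection": "docs_outliers", # hypothetical output collection
}

# Pick the oneOf branch whose job type matches, then validate required fields,
# patterns, and bounds locally; raises ValidationError on a bad config.
branch = next(b for b in schema["oneOf"]
              if b["properties"]["type"]["default"] == outlier_job["type"])
validate(instance=outlier_job, schema=branch)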
{
"type": "object",
"title": "ALS Recommender (deprecated)",
"description": "Use this job when you want to compute user recommendations or item similarities using a collaborative filtering recommender. You can also implement a user-to-item recommender in the advanced section of this job’s configuration UI. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the BPR Recommender instead.",
"required": [
"id",
"trainingCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Recommender Training Collection",
"description": "User/Item preference collection (often a signals collection or signals aggregation collection)"
},
"outputCollection": {
"type": "string",
"title": "Items-for-users Recommendation Collection",
"description": "Collection to store batch-predicted user/item recommendations (if absent, none computed)"
},
"outputItemSimCollection": {
"type": "string",
"title": "Item-to-item Similarity Collection",
"description": "Collection to store batch-computed item/item similarities (if absent, none computed)"
},
"numRecs": {
"type": "integer",
"title": "Number of User Recommendations to Compute",
"description": "Batch compute and store this many item recommendations per user",
"default": 10
},
"numSims": {
"type": "integer",
"title": "Number of Item Similarites to Compute",
"description": "Batch compute and store this many item similarities per item",
"default": 10
},
"implicitRatings": {
"type": "boolean",
"title": "Implicit Preferences",
"description": "Treat training preferences as implicit signals of interest (i.e. clicks or other actions) as opposed to explicit item ratings",
"default": true
},
"deleteOldRecs": {
"type": "boolean",
"title": "Delete Old Recommendations",
"description": "Delete old recommendations after generating new recommendations.",
"default": true
},
"excludeFromDeleteFilter": {
"type": "string",
"title": "Exclude from Delete Filter",
"description": "If the 'Delete Old Recommendations' flag is enabled, then use this query filter to identify existing recommendation docs to exclude from delete. The filter should identify recommendation docs you want to keep.",
"hints": [
"advanced"
]
},
"outputUserRecsCollection": {
"type": "string",
"title": "Users-for-items Recommendation Collection",
"description": "Collection to store batch-predicted item/user recommendations (if absent, none computed)",
"hints": [
"advanced"
]
},
"numUserRecsPerItem": {
"type": "integer",
"title": "Number of Users to Recommend to each Item",
"description": "Batch compute and store this many user recommendations per item",
"default": 10,
"hints": [
"advanced"
]
},
"modelId": {
"type": "string",
"title": "Recommender Model ID",
"description": "Identifier for the recommender model. Will be used as the unique key when storing the model in Solr. If absent, it will default to the job ID.",
"hints": [
"advanced"
]
},
"saveModel": {
"type": "boolean",
"title": "Save Model in Solr",
"description": "Whether we should save the computed ALS model in Solr",
"default": false,
"hints": [
"advanced"
]
},
"modelCollection": {
"type": "string",
"title": "Model Collection",
"description": "Collection to load and store the computed model, if \"Save Model\" is true. Defaults to \"[app name]_recommender_models\"",
"hints": [
"advanced"
],
"minLength": 1
},
"alwaysTrain": {
"type": "boolean",
"title": "Force model re-training",
"description": "Even if a model with this modelId exists, re-train if set true",
"default": true,
"hints": [
"advanced"
]
},
"maxTrainingIterations": {
"type": "integer",
"title": "Maximum Training Iterations",
"description": "Maximum number of iterations to use when learning the matrix decomposition",
"default": 10,
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr query to filter training data (e.g. downsampling or selecting based on min. pref values)",
"default": "*:*",
"hints": [
"advanced"
]
},
"popularItemMin": {
"type": "integer",
"title": "Training Data Filter By Popular Items",
"description": "Items must have at least this # of unique users interacting with it to go into the sample",
"default": 2,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Downsample preferences for items (bounded to at least 2) by this fraction",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"userIdField": {
"type": "string",
"title": "Training Collection User Id Field",
"description": "Solr field name containing stored user ids",
"default": "user_id_s",
"hints": [
"advanced"
]
},
"itemIdField": {
"type": "string",
"title": "Training Collection Item Id Field",
"description": "Solr field name containing stored item ids",
"default": "item_id_s",
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Training Collection Weight Field",
"description": "Solr field name containing stored weights or preferences the user has for that item",
"default": "weight_d",
"hints": [
"advanced"
]
},
"initialBlocks": {
"type": "integer",
"title": "Training Block Size",
"description": "Number of sub-matrix blocks to break the training data into (default: -1, for auto-sizing)",
"default": -1,
"hints": [
"hidden"
]
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Training DataFrame Config Options",
"description": "Additional Spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"initialRank": {
"type": "integer",
"title": "Recommender Rank",
"description": "Number of user/item factors in the recommender decomposition (or starting guess for it, if doing parameter grid search)",
"default": 100,
"hints": [
"advanced"
]
},
"initialAlpha": {
"type": "number",
"title": "Implicit Preference Confidence",
"description": "Confidence weight to give the implicit preferences (or starting guess, if doing parameter grid search)",
"default": 50,
"hints": [
"advanced"
]
},
"initialLambda": {
"type": "number",
"title": "Initial Lambda",
"description": "Smoothing parameter to avoid overfitting (or starting guess, if doing parameter grid search). Slightly larger value needed for small data sets",
"default": 0.01,
"hints": [
"advanced"
]
},
"gridSearchWidth": {
"type": "integer",
"title": "Grid Search Width",
"description": "Parameter grid search to be done centered around initial parameter guesses, exponential step size, this number of steps (if <= 0, no grid search). 1 is a reasonable number to start with.",
"default": 0,
"hints": [
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 13,
"hints": [
"advanced"
]
},
"itemMetadataFields": {
"type": "array",
"title": "Item Metadata Fields",
"description": "List of item metadata fields to include in the recommendation output documents.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"itemMetadataCollection": {
"type": "string",
"title": "Item Metadata Collection",
"description": "Fusion collection or catalog asset ID containing item metadata fields you want to add to the recommendation output documents.",
"hints": [
"advanced"
]
},
"itemMetadataJoinField": {
"type": "string",
"title": "Item Metadata Join Field",
"description": "Name of field in the item metadata collection to join on; defaults to the item id field configured for this job.",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format which training data comes in (like 'solr', 'hdfs', 'file', 'parquet' etc)",
"default": "solr",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"als_recommender"
],
"default": "als_recommender",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"outputUserRecsCollection",
"outputItemSimCollection",
"writeOptions"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"numSims",
"implicitRatings",
"deleteOldRecs"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"popularItemMin",
"trainingSampleFraction",
"userIdField",
"itemIdField",
"weightField",
"maxIters",
"trainingDataFrameConfigOptions",
"initialBlocks"
]
},
{
"label": "Model Settings",
"properties": [
"modelId",
"saveModel",
"modelCollection",
"alwaysTrain"
]
},
{
"label": "Grid Search Settings",
"properties": [
"initialRank",
"gridSearchWidth",
"initialAlpha",
"initialLambda",
"randomSeed"
]
},
{
"label": "Item Metadata Settings",
"properties": [
"itemMetadataCollection",
"itemMetadataJoinField",
"itemMetadataFields"
]
}
]
},
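Once an als_recommender config exists, a run can be started and polled until it finishes. The /api/spark/jobs endpoints and the status field name here are assumptions for illustration:
import time
import requests

FUSION = "https://{FUSION HOST}"
headers = {"Authorization": "Basic <encoded-value>"}
job_id = "als-recs"  # hypothetical, assumed already configured

requests.post(f"{FUSION}/api/spark/jobs/{job_id}", headers=headers)  # start a run
while True:
    status = requests.get(f"{FUSION}/api/spark/jobs/{job_id}", headers=headers).json()
    # "state" is an assumed field name in the job status payload.
    if status.get("state") not in ("starting", "running"):
        break
    time.sleep(10)
print(status)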
{
"type": "object",
"title": "Upload Model Parameters To Cloud",
"description": "Upload a trained model's parameters to cloud storage",
"required": [
"id",
"modelName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Seldon Core deployment to upload (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"cloudPath": {
"type": "string",
"title": "Cloud Path",
"description": "Path to cloud storage location that will contain the saved parameters for this model - the model version will be appended to the filename at the end of the path string. Supports S3, GCS, or Azure Blob Storage URIs"
},
"cloudSecret": {
"type": "string",
"title": "Kubernetes secret name for cloud storage access",
"description": "Defines the Kubernetes secret that will be used to access cloud storage"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-upload-model"
],
"default": "argo-upload-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
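The modelName here must be a valid lowercase DNS subdomain, so a quick local check against the pattern from the schema can catch a bad name before submission. The config values are hypothetical:
import re

# DNS-subdomain pattern copied from the schema's modelName property.
MODEL_NAME = re.compile(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$")

upload_job = {
    "id": "upload-qna-model",
    "type": "argo-upload-model",
    "modelName": "qna-model",                        # hypothetical Seldon deployment name
    "cloudPath": "gs://my-bucket/models/qna-model",  # hypothetical GCS path
    "cloudSecret": "gcs-credentials",                # hypothetical K8s secret name
}

assert MODEL_NAME.match(upload_job["modelName"]) and len(upload_job["modelName"]) <= 30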
{
"type": "object",
"title": "Classification",
"description": "Trains a classification model to classify text documents by assigning a label to them.",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"textField",
"labelField",
"deployModelName",
"workflowType",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"stopwordsBlobName": {
"type": "string",
"title": "Stopwords Blob Store",
"description": "Name of the stopwords blob resource. This is a .txt file with one stopword per line. By default the file is called stopwords/stopwords_en.txt however a custom file can also be used. Check documentation for more details on format and uploading to blob store.",
"default": "stopwords/stopwords_en.txt",
"reference": "blob",
"blobType": "file:spark"
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"textField": {
"type": "string",
"title": "Training collection content field",
"description": "Solr field name containing the text to be classified",
"minLength": 1
},
"labelField": {
"type": "string",
"title": "Training collection class field",
"description": "Solr field name containing the classes/labels for the text",
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`.",
"hints": [
"code/sql",
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"deployModelName": {
"type": "string",
"title": "Model Deployment Name",
"description": "Name of the model to be used for deployment (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"workflowType": {
"type": "string",
"title": "Method",
"description": "Method to be used for classification.",
"enum": [
"Logistic Regression",
"Starspace"
],
"default": "Logistic Regression"
},
"minCharLen": {
"type": "integer",
"title": "Minimum No. of Characters",
"description": "Minimum length, in characters, for the text to be included into training.",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"maxCharLen": {
"type": "integer",
"title": "Maximum No. of Characters",
"description": "Maximum length, in characters, of the training text. Texts longer than this value will be truncated.",
"default": 100000,
"minimum": 1,
"exclusiveMinimum": false
},
"lowercaseTexts": {
"type": "boolean",
"title": "Lowercase Text",
"description": "Select if you want the text to be lowercased",
"default": true
},
"unidecodeTexts": {
"type": "boolean",
"title": "Unidecode Text",
"description": "Select if you want the text to be unidecoded",
"default": true
},
"minClassSize": {
"type": "integer",
"title": "Minimum no. of examples per class",
"description": "Minimum number of samples that class should have to be included into training. Otherwise the class and all its samples are dropped.",
"default": 5,
"minimum": 2,
"exclusiveMinimum": false
},
"valSize": {
"type": "number",
"title": "Validation set size",
"description": "Size of the validation dataset. Provide a float (0, 1) if you want to sample as a fraction, or an integer >= 1 if you want to sample exact number of records.",
"default": 0.1
},
"topK": {
"type": "integer",
"title": "Number of Output classes",
"description": "Number of most probable output classes to assign to each sample along with their scores.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"featurizerType": {
"type": "string",
"title": "Featurizer",
"description": "The type of featurizer to use. TFIDF will compute both term-frequency and inverse document-frequency, whereas Count will use only term-frequency",
"enum": [
"tfidf",
"count"
],
"default": "tfidf",
"hints": [
"advanced"
]
},
"useCharacters": {
"type": "boolean",
"title": "Use Characters",
"description": "Whether to use the characters or word analyzer. Use words if the text is long. Using characters on long text can significantly increase vectorization time and memory requirements.",
"default": true
},
"tokenPattern": {
"type": "string",
"title": "Token filtering pattern",
"description": "Regex pattern for filtering tokens.",
"default": "(?u)\\b\\w\\w+\\b",
"hints": [
"hidden"
]
},
"minDf": {
"type": "number",
"title": "Min Document Frequency",
"description": "Minimum Df for token to be considered. Provide a float (0,1) if you want to specify as a fraction, otherwise integer >= 1 to specify the exact number of documents in which a token should occur.",
"default": 1,
"hints": [
"advanced"
]
},
"maxDf": {
"type": "number",
"title": "Max Document Frequency",
"description": "Maximum Df for token to be considered. Provide a float (0,1) if you want to specify as a fraction, otherwise integer >= 1 to specify the exact number of documents in which a token should occur",
"default": 0.8,
"hints": [
"advanced"
]
},
"minNgram": {
"type": "integer",
"title": "Min Ngram size",
"description": "Minimum word or character ngram size to be used.",
"minimum": 1,
"exclusiveMinimum": false
},
"maxNgram": {
"type": "integer",
"title": "Max Ngram size",
"description": "Maximum word or character ngram size to be used.",
"minimum": 1,
"exclusiveMinimum": false
},
"maxFeatures": {
"type": "integer",
"title": "Maximum Vocab Size",
"description": "Maximum number of tokens (including word or character ngrams) to consider for the vocabulary. Less frequent tokens will be omitted.",
"default": 250000,
"minimum": 1,
"exclusiveMinimum": false
},
"norm": {
"type": "string",
"title": "Use Norm",
"description": "Select the norm method to use.",
"enum": [
"None",
"L1",
"L2"
],
"default": "None",
"hints": [
"advanced"
]
},
"smoothIdf": {
"type": "boolean",
"title": "Smooth IDF",
"description": "Smooth IDF weights by adding one to document frequencies. Prevents zero divisions.",
"default": true,
"hints": [
"advanced"
]
},
"sublinearTf": {
"type": "boolean",
"title": "Sublinear TF",
"description": "Whether to apply sublinear scaling to TF, i.e. replace tf with 1 + log(tf). It usually helps when characters are used. ",
"default": true,
"hints": [
"advanced"
]
},
"scaling": {
"type": "boolean",
"title": "Scale Features",
"description": "Whether to apply Standard Scaling (X - mean(X)) / std(X) for the features. If the feature vector is sparse (no dimensionality reduction is used), then only division on standard deviation will be applied.",
"default": true
},
"dimReduction": {
"type": "boolean",
"title": "Perform Dimensionality Reduction",
"description": "Whether to perform dimensionality reduction or not. Truncated SVD is used to reduce dimensionality. Reduces overfitting and training time. Note that sparse vectors will become dense.",
"default": false
},
"dimReductionSize": {
"type": "integer",
"title": "Reduced Dimension Size",
"description": "The target dimension size of the features after dimensionality reduction.",
"default": 256,
"minimum": 1,
"exclusiveMinimum": false
},
"penalty": {
"type": "string",
"title": "Penalty",
"description": "Specify the norm used in the penalization. l2 is supported only by the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers. ‘elasticnet’ is only supported by the ‘saga’ solver. Select none, if you don't want to regularize (this is not supported by the `liblinear` solver).",
"enum": [
"l1",
"l2",
"elsaticnet",
"none"
],
"default": "l2",
"hints": [
"advanced"
]
},
"l1Ratio": {
"type": "number",
"title": "L1 penalty ratio",
"description": "Only used with the `elasticnet` penalty. If its value = 0, l2 penalty will be used. If it's value = 1, l1 penalty will be used. A value in between will use the appropirate ratio of l1 and l2 penalties.",
"default": 0.5,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"tol": {
"type": "number",
"title": "Stopping tolerance",
"description": "Tolerance for stopping criteria.",
"default": 0.0001
},
"reg": {
"type": "number",
"title": "Regularization term",
"description": "This is the inverse of regularization strength. Smaller values result in stronger regularization.",
"default": 1
},
"useClassWeights": {
"type": "boolean",
"title": "Use class weights",
"description": "If true, a weight is applied to each class inversely proportional to its frequency.",
"default": false
},
"solver": {
"type": "string",
"title": "Optimization Algorithm",
"description": "The optimization algorithm to use to fit to the data. LBFGS and SAGA are good initial choices.",
"enum": [
"lbfgs",
"newton-cg",
"liblinear",
"sag",
"saga"
],
"default": "lbfgs",
"hints": [
"advanced"
]
},
"multiClass": {
"type": "string",
"title": "Loss Method",
"description": "Whether to train a binary classifier for each class or use a multinomial loss. ‘auto’ selects ‘ovr’ if the data is binary, or if algorithm=’liblinear’, and otherwise selects ‘multinomial’.",
"enum": [
"auto",
"ovr",
"multinomial"
],
"default": "auto",
"hints": [
"advanced"
]
},
"maxIter": {
"type": "integer",
"title": "Maximum iterations for algorithm",
"description": "Maximum number of iterations taken for the optimization algorithm to converge.",
"default": 200,
"minimum": 1,
"exclusiveMinimum": false
},
"textLayersSizes": {
"type": "string",
"title": "Hidden sizes before text embedding",
"description": "Sizes of hidden layers before the embedding layer for text. Specify as a list of numbers for multiple layers or a single number for 1 layer. Leave blank if no hidden layers are required.",
"default": "[256, 128]",
"pattern": "^(\\[(((\\d)*,\\s*)*(\\d+)+)?\\])?$"
},
"labelLayersSizes": {
"type": "string",
"title": "Hidden sizes before class embedding",
"description": "Sizes of hidden layers before the embedding layer for classes. Specify as a list of numbers for multiple layers or a single number for 1 layer. Leave blank if no hidden layers are required.",
"default": "[]",
"pattern": "^(\\[(((\\d)*,\\s*)*(\\d+)+)?\\])?$"
},
"embeddingsSize": {
"type": "integer",
"title": "Embedding size",
"description": "Dimension size of final embedding vectors for text and class.",
"default": 100,
"minimum": 1,
"exclusiveMinimum": false
},
"regTerm": {
"type": "number",
"title": "Regularization Term",
"description": "Scale of L2 regularization",
"default": 0.002
},
"dropout": {
"type": "number",
"title": "Dropout",
"description": "Probability for applying dropout regularization.",
"default": 0.2
},
"embeddingReg": {
"type": "number",
"title": "Embedding regularization",
"description": "The scale of how critical the algorithm should be of minimizing the maximum similarity between embeddings of different classes",
"default": 0.8,
"hints": [
"advanced"
]
},
"minBatchSize": {
"type": "integer",
"title": "Minimum Batch Size",
"description": "The smallest batch size with which to start training. Batch size will be increased linearly every epoch, upto the maximum batch size specified.",
"default": 64,
"minimum": 1,
"exclusiveMinimum": false
},
"maxBatchSize": {
"type": "integer",
"title": "Maximum Batch Size",
"description": "The largest batch size to use during training. Batch size will be increased linearly every epoch, upto the maximum batch size specified.",
"default": 128,
"minimum": 1,
"exclusiveMinimum": false
},
"numEpochs": {
"type": "integer",
"title": "Number of training epochs",
"description": "Number of epochs for which to train the model.",
"default": 40,
"minimum": 1,
"exclusiveMinimum": false
},
"muPos": {
"type": "number",
"title": "Maximum correct class similarity",
"description": "How similar algorithm should try to make embedding vectors for correct classes. The algorithm will try to maximize similarities so that it's higher than the value specified here.",
"default": 0.8,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"muNeg": {
"type": "number",
"title": "Maximum negative class similarity",
"description": "How similar algorithm should try to make embedding vectors for negative classes. The algorithm will try to minimize similarities so that it's lower than the value specified here.",
"default": -0.4,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"similarityType": {
"type": "string",
"title": "Similarity type",
"description": "Type of similarity to use to compare the embedded vectors.",
"enum": [
"cosine",
"inner"
],
"default": "cosine",
"hints": [
"advanced"
]
},
"numNeg": {
"type": "integer",
"title": "Number of negative classes for training",
"description": "Number of negative classes to use during training to minimize their similarity to the input text. Should be less than the total number of classes.",
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"useMaxNegSim": {
"type": "boolean",
"title": "Only minimize max. negative similarity",
"description": "If true, only the maximum similarity for negative classes will be minimized. If unchecked, all negative similarities will be used.",
"default": true,
"hints": [
"advanced"
]
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-classification"
],
"default": "argo-classification",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"deployModelName",
"trainingCollection",
"trainingFormat",
"modelReplicas",
"secretName"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"randomSeed",
"textField",
"labelField"
]
},
{
"label": "Preprocessing Parameters",
"properties": [
"minCharLen",
"maxCharLen",
"minClassSize",
"lowercaseTexts",
"unidecodeTexts"
]
},
{
"label": "Eval and Output Parameters",
"properties": [
"valSize",
"topK"
]
},
{
"label": "Vectorization Parameters",
"properties": [
"featurizerType",
"useCharacters",
"stopwordsBlobName",
"minDf",
"maxDf",
"minNgram",
"maxNgram",
"maxFeatures",
"norm",
"smoothIdf",
"sublinearTf",
"scaling",
"dimReduction",
"dimReductionSize"
]
},
{
"label": "Logistic Regression Parameters",
"properties": [
"penalty",
"l1Ratio",
"tol",
"reg",
"useClassWeights",
"solver",
"multiClass",
"maxIter"
]
},
{
"label": "Starspace Parameters",
"properties": [
"textLayersSizes",
"labelLayersSizes",
"embeddingsSize",
"regTerm",
"dropout",
"embeddingReg",
"minBatchSize",
"maxBatchSize",
"numEpochs",
"muPos",
"muNeg",
"similarityType",
"numNeg",
"useMaxNegSim"
]
}
]
},
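Pulling the required fields together, a Logistic Regression classification job might be configured like this. Every value is a placeholder and the endpoint is the same assumed configurations endpoint as above:
import requests

FUSION = "https://{FUSION HOST}"
headers = {"Authorization": "Basic <encoded-value>"}

clf_job = {
    "id": "classify-tickets",            # hypothetical job id
    "type": "argo-classification",
    "trainingCollection": "tickets",     # hypothetical labeled data collection
    "trainingFormat": "solr",
    "textField": "body_t",               # hypothetical content field
    "labelField": "category_s",          # hypothetical label field
    "deployModelName": "ticket-clf",     # lowercase DNS subdomain, no underscores
    "workflowType": "Logistic Regression",
    "valSize": 0.1,                      # hold out 10% for validation
    "topK": 3,                           # return the 3 most probable classes
}

requests.post(f"{FUSION}/api/spark/configurations", json=clf_job, headers=headers)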
{
"type": "object",
"title": "Delete Ray Model Deployment",
"description": "Removes a Ray model deployment from the cluster",
"required": [
"id",
"modelName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Ray deployment to delete",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-delete-ray-model"
],
"default": "argo-delete-ray-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
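The Delete Ray Model Deployment job needs only its three required keys. A minimal sketch; the model name is a hypothetical placeholder and must match an existing Ray deployment:

# Hypothetical config for deleting a Ray model deployment.
delete_ray_job = {
    "id": "delete-ray-model",
    "modelName": "my-ray-model",  # hypothetical; the deployment to remove
    "type": "argo-delete-ray-model",
}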
{
"type": "object",
"title": "Delete Collections in Milvus (deprecated)",
"description": "Deletes specified collections in Milvus",
"required": [
"id",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"collections-list": {
"type": "array",
"title": "Collections",
"description": "List of collections in Milvus that should be deleted.",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-milvus-delete-collections"
],
"default": "argo-milvus-delete-collections",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
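Similarly, the (deprecated) Milvus deletion job requires only `id` and `type`, with the collections to drop passed in `collections-list`. A sketch with hypothetical collection names:

# Hypothetical config for the deprecated Milvus collection-deletion job.
milvus_delete_job = {
    "id": "delete-milvus-collections",
    "collections-list": ["old_vectors", "stale_vectors"],  # hypothetical Milvus collections
    "type": "argo-milvus-delete-collections",
}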
{
"type": "object",
"title": "Ground Truth",
"description": "Use this job when you want to estimate ground truth queries using click and query signals with document relevance per query determined using a click/skip formula. Pair this job with ranking metrics job to calculate relevance metrics, such as nDCG",
"required": [
"id",
"signalsCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"signalsCollection": {
"type": "string",
"title": "Signals collection",
"description": "Collection containing click signals and the associated search log identifier",
"minLength": 1
},
"searchLogsAddOpts": {
"type": "object",
"title": "Search Logs and Options",
"description": "Additional options to use while loading search logs collection",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"signalsAddOpts": {
"type": "object",
"title": "Additional Signals Options",
"description": "Additional options to use while loading signals collection",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"searchLogsPipeline": {
"type": "string",
"title": "Search Logs Pipeline",
"description": "Pipeline id associated with search log entries",
"hints": [
"advanced"
],
"minLength": 1
},
"joinKeySearchLogs": {
"type": "string",
"title": "Join Key (Query Signals)",
"description": "Join key of query signals in the signals collection",
"default": "id",
"hints": [
"advanced"
]
},
"joinKeySignals": {
"type": "string",
"title": "Join Key (Click Signals)",
"description": "Join key of click signals in the signals collection",
"default": "fusion_query_id",
"hints": [
"advanced"
]
},
"filterQueries": {
"type": "array",
"title": "Filter Queries",
"description": "Filter queries to apply while choosing top queries from query signals in signals collection",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"topQueriesLimit": {
"type": "integer",
"title": "Top Queries Limit",
"description": "Total number of queries to pick for Ground truth calculations",
"default": 100,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"ground_truth"
],
"default": "ground_truth",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"signalsCollection"
]
},
{
"label": "Additional Options",
"properties": [
"searchLogsPipeline",
"joinKeySearchLogs",
"joinKeySignals",
"searchLogsAddOpts",
"signalsAddOpts",
"filterQueries",
"topQueriesLimit"
]
}
]
},
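A Ground Truth config needs only `id`, `signalsCollection`, and `type`. The sketch below also posts it to the job-configuration API; the endpoint path is an assumption rather than something this schema states, and the signals collection name is a hypothetical placeholder.

import requests

ground_truth_job = {
    "id": "ground-truth-daily",
    "signalsCollection": "my_app_signals",  # hypothetical signals collection
    "topQueriesLimit": 100,                 # schema default
    "type": "ground_truth",
}
# Assumed endpoint for creating job configurations; adjust to your deployment.
resp = requests.post(
    "https://{FUSION HOST}/api/spark/configurations",
    json=ground_truth_job,
    headers={"Authorization": "Basic <encoded-value>"},
)
print(resp.status_code, resp.text)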
{
"type": "object",
"title": "Create Seldon Core Model Deployment",
"description": "Deploys a Seldon Core Model into the Fusion cluster",
"required": [
"id",
"deployModelName",
"modelDockerRepo",
"modelDockerImage",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"deployModelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Seldon Core deployment to deploy (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"modelReplicas": {
"type": "integer",
"title": "Model replicas",
"description": "How many replicas of the model should be deployed by Seldon Core",
"default": 1
},
"modelDockerRepo": {
"type": "string",
"title": "Docker repository",
"description": "Defines the Docker repository where the model image is located."
},
"modelDockerImage": {
"type": "string",
"title": "Image name",
"description": "Name of the model's docker image"
},
"modelDockerSecret": {
"type": "string",
"title": "Kubernetes secret name for model repo",
"description": "Defines the Kubernetes secret to be used with the Docker repository"
},
"columnNames": {
"type": "string",
"title": "Output column names for model",
"description": "A list of column names that the model generates which the ML Service will return after inference.",
"default": "[output1, output2]"
},
"cloudPath": {
"type": "string",
"title": "Cloud Path",
"description": "Path to cloud storage location that contains the saved parameters for this model. Supports S3, GCS, or Azure Blob Storage URIs",
"hints": [
"advanced"
]
},
"cloudSecret": {
"type": "string",
"title": "Kubernetes secret name for cloud storage access",
"description": "Defines the Kubernetes secret that will be used to access cloud storage",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-deploy-model"
],
"default": "argo-deploy-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
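For the Seldon Core deployment job, the required keys map directly onto a Docker image reference. A minimal sketch; the repository, image, and model names are hypothetical:

# Hypothetical config for deploying a Seldon Core model.
seldon_deploy_job = {
    "id": "deploy-my-model",
    "deployModelName": "my-model",            # lowercase DNS subdomain, no underscores
    "modelDockerRepo": "my-registry/models",  # hypothetical Docker repository
    "modelDockerImage": "my-model:1.0.0",     # hypothetical image name
    "modelReplicas": 1,                       # schema default
    "type": "argo-deploy-model",
}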
{
"type": "object",
"title": "Trending Recommender",
"description": "Trending Recommender",
"required": [
"id",
"trainingCollection",
"dataFormat",
"refTimeRange",
"targetTimeRange",
"countField",
"typeField",
"timeField",
"docIdField",
"types",
"recsCount",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Solr Fields to Read",
"description": "Fields to extract from Solr (not used for other formats)",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"refTimeRange": {
"type": "integer",
"title": "Reference Time Days",
"description": "Number of reference days: number of days to use as baseline to find trends (calculated from today)"
},
"targetTimeRange": {
"type": "integer",
"title": "Target Time Days",
"description": "Number of target days: number of days to use as target to find trends (calculated from today)"
},
"numWeeksRef": {
"type": "number",
"title": "Num Weeks Reference",
"description": "If using filter queries for reference and target time ranges, enter the value of (reference days / target days) here (if not using filter queries, this will be calculated automatically)",
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Field containing the number of times an event (e.g. click) occurs for a particular query; count_i in the raw signal collection or aggr_count_i in the aggregated signal collection.",
"default": "aggr_count_i",
"minLength": 1
},
"referenceTimeFilterQuery": {
"type": "string",
"title": "Reference Filter Time Query",
"description": "Add a Spark SQL filter query here for greater control of time filtering",
"hints": [
"advanced"
]
},
"targetFilterTimeQuery": {
"type": "string",
"title": "Target Filter Time Query",
"description": "Add a Spark SQL filter query here for greater control of time filtering",
"hints": [
"advanced"
]
},
"typeField": {
"type": "string",
"title": "Type field",
"description": "Enter type field (default is type)",
"default": "aggr_type_s"
},
"timeField": {
"type": "string",
"title": "Time field",
"description": "Enter time field (default is timestamp_tdt)",
"default": "timestamp_tdt"
},
"docIdField": {
"type": "string",
"title": "Document ID field",
"description": "Enter document id field (default is doc_id)",
"default": "doc_id_s"
},
"types": {
"type": "string",
"title": "Event types",
"description": "Enter a comma-separated list of event types to filter on",
"default": "click,add"
},
"recsCount": {
"type": "integer",
"title": "Recommendation Count",
"description": "Maximum number of recs to generate (or -1 for no limit)",
"default": 500
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"trending-recommender"
],
"default": "trending-recommender",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"countField"
]
}
]
},
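The Trending Recommender's long required list is mostly covered by schema defaults; only the collections and window lengths are app-specific. A sketch (collection names and day counts are hypothetical):

# Hypothetical config comparing a 30-day baseline against the last 7 days.
trending_job = {
    "id": "trending-recs",
    "trainingCollection": "my_app_signals_aggr",  # hypothetical aggregated signals
    "dataFormat": "solr",                         # schema default
    "refTimeRange": 30,                           # hypothetical baseline window (days)
    "targetTimeRange": 7,                         # hypothetical target window (days)
    "countField": "aggr_count_i",                 # schema default
    "typeField": "aggr_type_s",                   # schema default
    "timeField": "timestamp_tdt",                 # schema default
    "docIdField": "doc_id_s",                     # schema default
    "types": "click,add",                         # schema default
    "recsCount": 500,                             # schema default
    "outputCollection": "trending_recs",          # hypothetical output collection
    "type": "trending-recommender",
}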
{
"type": "object",
"title": "Smart Answers Evaluate Pipeline",
"description": "Evaluates performance of a configured pipeline",
"required": [
"id",
"inputEvaluationCollection",
"trainingFormat",
"outputEvaluationCollection",
"outputFormat",
"appName",
"queryPipelineName",
"collectionName",
"returnFields",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputEvaluationCollection": {
"type": "string",
"title": "Input Evaluation Data Path",
"description": "Cloud storage path or Solr collection to pull labeled data for use in evaluation",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Input data format",
"description": "The format of the input data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"outputEvaluationCollection": {
"type": "string",
"title": "Output Evaluation Data Path",
"description": "Cloud storage path or Solr collection to store evaluation results (recommended collection is job_reports)",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"batchSize": {
"type": "string",
"title": "Output Batch Size",
"description": "If writing to solr, this field defines the batch size for documents to be pushed to solr.",
"hints": [
"advanced"
]
},
"outputFormat": {
"type": "string",
"title": "Output format",
"description": "The format of the output data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
"hints": [
"code/sql",
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Sampling proportion",
"description": "The proportion of data to be sampled from the full dataset. Use a value between 0 and 1 for a proportion (e.g. 0.5 for 50%), or for a specific number of examples, use an integer larger than 1. Leave blank for no sampling",
"hints": [
"advanced"
]
},
"seed": {
"type": "integer",
"title": "Sampling Seed",
"description": "Random seed for sampling",
"default": 12345,
"hints": [
"advanced"
]
},
"testQuestionFieldInFile": {
"type": "string",
"title": "Test Question Field",
"description": "Defines the field in the collection containing the test question",
"default": "question"
},
"matchFieldInFile": {
"type": "string",
"title": "Ground Truth Field",
"description": "Field which contains id or text of the ground truth answer in the evaluation collection",
"default": "answer_id"
},
"matchFieldInFusion": {
"type": "string",
"title": "Answer or id Field in Fusion",
"description": "Field name in Fusion which contains answer id or text for matching ground truth answer id or text in the evaluation collection",
"default": "doc_id"
},
"appName": {
"type": "string",
"title": "App name",
"description": "Fusion app where indexed documents or QA pairs live."
},
"queryPipelineName": {
"type": "string",
"title": "Fusion Query Pipeline",
"description": "Configured query pipeline name that should be used for evaluation"
},
"collectionName": {
"type": "string",
"title": "Main Collection",
"description": "Fusion collection where indexed documents or QA pairs live"
},
"additionalParams": {
"type": "string",
"title": "Additional query parameters",
"description": "Additional query parameters to pass to return resultsfrom Fusion. Please specify in dictionary format: e.g. { \"rowsFromSolrToRerank\": 20,\"fq\": \"type:answer\" }\"",
"hints": [
"advanced"
]
},
"returnFields": {
"type": "string",
"title": "Return fields",
"description": "Fields (comma-separated) that should be returned from the main collection (e.g. question, answer). The job will add them to the output evaluation"
},
"rankingScoreField": {
"type": "string",
"title": "Ranking score",
"description": "Score to be used for ranking and evaluation",
"default": "ensemble_score",
"hints": [
"advanced"
]
},
"metricsList": {
"type": "string",
"title": "Metrics list",
"description": "List of metrics that should be computed during evaluation. e.g.[\"recall\",\"precision\",\"map\",\"mrr\"]",
"default": "[\"recall\",\"map\",\"mrr\"]",
"hints": [
"advanced"
]
},
"kList": {
"type": "string",
"title": "Metrics@k list",
"description": "The k retrieval position that will be used to compute for each metric",
"default": "[1,3,5]",
"hints": [
"advanced"
]
},
"doWeightsSelection": {
"type": "boolean",
"title": "Perform weights selection",
"description": "Whether to perform grid search to find the best weights combination for ranking scores for query pipeline's Compute Mathematical Expression stage\"",
"default": false,
"hints": [
"advanced"
]
},
"solrScaleFunc": {
"type": "string",
"title": "Solr scale function",
"description": "Function used in the pipeline to scale Solr scores. E.g., scale by max Solr score retrieved (max), scale by log with base 10 (log10) or take squre root of score (pow0.5)",
"default": "max"
},
"scoreListForWeights": {
"type": "string",
"title": "List of ranking scores for ensemble",
"description": "Ranking scores (comma-separated) used for ensemble in the query pipeline's Compute Mathematical Expression stage. The job will perform weights selection for the listed scores",
"default": "score,vectors_distance"
},
"targetRankingMetric": {
"type": "string",
"title": "Target metric to use for weight selection",
"description": "Target ranking metric to optimize during weights selection",
"default": "mrr@3"
},
"fetcherType": {
"type": "string",
"title": "Fetcher Type to use with query evaluation",
"default": "query-service",
"hints": [
"hidden"
]
},
"useLabelingResolution": {
"type": "boolean",
"title": "Use Labeling Resolution",
"description": "Check this to determine similar questions and similar answers via labeling resolution and graph connected components. Does not work well with signals data.",
"default": false,
"hints": [
"advanced"
]
},
"useConcurrentQuerying": {
"type": "boolean",
"title": "Use Concurrent Querying",
"description": "Check this option if you want to make concurrent queries to Fusion. It will greatly speed up the job at the cost of increased load on Fusion. Use with caution.",
"default": false,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-qna-evaluate"
],
"default": "argo-qna-evaluate",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input / Output Parameters",
"properties": [
"inputEvaluationCollection",
"trainingFormat",
"outputEvaluationCollection",
"outputFormat",
"trainingDataFilterQuery",
"testQuestionFieldInFile",
"matchFieldInFile",
"trainingSampleFraction",
"seed",
"useLabelingResolution",
"partitionFields",
"batchSize",
"secretName"
]
},
{
"label": "Query Pipeline Input / Output Parameters",
"properties": [
"appName",
"collectionName",
"queryPipelineName",
"matchFieldInFusion",
"additionalParams",
"returnFields",
"useConcurrentQuerying"
]
},
{
"label": "Metrics",
"properties": [
"rankingScoreField",
"metricsList",
"kList",
"doWeightsSelection",
"solrScaleFunc",
"scoreListForWeights",
"targetRankingMetric"
]
}
]
},
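A Smart Answers evaluation config ties a labeled collection to the query pipeline being measured. A minimal sketch; the app, pipeline, and collection names are hypothetical, while the metric values shown are the schema defaults from above:

# Hypothetical config for evaluating a Smart Answers query pipeline.
qna_eval_job = {
    "id": "sa-evaluate",
    "inputEvaluationCollection": "qa_ground_truth",  # hypothetical labeled data
    "trainingFormat": "solr",
    "outputEvaluationCollection": "job_reports",     # recommended in the schema
    "outputFormat": "solr",
    "appName": "my-app",                             # hypothetical Fusion app
    "queryPipelineName": "smart-answers",            # hypothetical query pipeline
    "collectionName": "my-app",                      # hypothetical main collection
    "returnFields": "question,answer",
    "metricsList": '["recall","map","mrr"]',         # schema default
    "kList": "[1,3,5]",                              # schema default
    "type": "argo-qna-evaluate",
}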
{
"type": "object",
"title": "Script",
"description": "Run a custom Scala script as a Fusion Job.",
"required": [
"id",
"script",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"script": {
"type": "string",
"title": "Scala Script",
"description": "Custom script written in Scala to be executed in Fusion as a Spark job.",
"hints": [
"lengthy",
"code/scala"
],
"minLength": 1
},
"shellOptions": {
"type": "array",
"title": "Spark Shell Options",
"description": "Additional options to pass to the Spark shell when running this job.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"interpreterParams": {
"type": "array",
"title": "Interpreter Params",
"description": "Bind the key/values to the Scala interpreter",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"script"
],
"default": "script",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
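The Script job wraps arbitrary Scala in the same config shape. A minimal sketch; the one-line script assumes the Spark shell binds a `spark` session, which is an assumption rather than something this schema states:

# Hypothetical config running a trivial Scala script as a Fusion job.
script_job = {
    "id": "hello-scala",
    "script": 'println(s"Spark version: ${spark.version}")',  # assumes `spark` is bound
    "type": "script",
}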
{
"type": "object",
"title": "Synonym Detection (Deprecated)",
"description": "Use this job to generate synonym and similar query pairs. This job is deprecated.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"misspellingSQLDataFormat",
"phraseSQLDataFormat",
"countField",
"docIdField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Input Collection",
"description": "Collection containing queries, document id and event counts. Can be either signal aggregation collection or raw signals collection.",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field Name",
"description": "Field containing queries. Change to query to use against raw signals",
"default": "query_s",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Data filter query",
"description": "Solr query to use when loading training data if using Solr, Spark SQL expression for all other data sources",
"default": "*:*",
"hints": [
"dummy"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Collection to store synonym and similar query pairs.",
"hints": [
"dummy"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"misspellingCollection": {
"type": "string",
"title": "Misspelling Job Result Collection",
"description": "Solr collection containing reviewed result of Token and phrase spell correction job. Defaults to the query_rewrite_staging collection for the app."
},
"misspellingsFilterQuery": {
"type": "string",
"title": "Misspelling Job Result Filter Query",
"description": "Solr query to additionally filter the misspelling results. Defaults to reading all approved spell corrections.",
"default": "type:spell"
},
"keyPhraseCollection": {
"type": "string",
"title": "Phrase Extraction Job Result Collection",
"description": "Solr collection containing reviewed result of Phrase extraction job. Defaults to the query_rewrite_staging collection for the app."
},
"keyPhraseFilterQuery": {
"type": "string",
"title": "Phrase Extraction Job Result Filter Query",
"description": "Solr query to additionally filter the phrase extraction results. Defaults to reading all approved phrases.",
"default": "type:phrase"
},
"misspellingSQL": {
"type": "string",
"title": "Spark SQL filter query for misspelling data",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spell_input",
"default": "SELECT surface_form AS misspelling_s, output AS correction_s FROM spell_input WHERE doc_type = 'query_rewrite' AND type = 'spell' AND review IN ('approved' OR 'auto')",
"hints": [
"code/sql",
"advanced"
]
},
"misspellingSQLDataFormat": {
"type": "string",
"title": "Misspelling Data format",
"description": "Spark-compatible format that contains spelling data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"phraseSQL": {
"type": "string",
"title": "Spark SQL filter query for phrase data",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as phrase_input",
"default": "SELECT surface_form AS phrases_s, coalesce(confidence, lit(1d)) AS likelihood_d, coalesce(word_count, lit(1d)) AS word_num_i FROM phrase_input WHERE doc_type = 'query_rewrite' AND type = 'phrase' AND review IN ('approved' OR 'auto')",
"hints": [
"code/sql",
"advanced"
]
},
"phraseSQLDataFormat": {
"type": "string",
"title": "Phrase Data format",
"description": "Spark-compatible format that contains phrase data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"countField": {
"type": "string",
"title": "Event Count Field Name",
"description": "Solr field containing number of events (e.g., number of clicks). Change to count_i when running against raw signals",
"default": "aggr_count_i"
},
"docIdField": {
"type": "string",
"title": "Document id Field Name",
"description": "Solr field containing document id that user clicked. Change to doc_id for raw signal collection",
"default": "doc_id_s "
},
"overlapThreshold": {
"type": "number",
"title": "Query Similarity Threshold",
"description": "The threshold above which query pairs are consider similar. We can get more synonym pairs if increase this value but quality may get reduced.",
"default": 0.5,
"hints": [
"advanced"
]
},
"similarityThreshold": {
"type": "number",
"title": "Synonym Similarity Threshold",
"description": "The threshold above which synonym pairs are consider similar. We can get more synonym pairs if increase this value but quality may get reduced.",
"default": 0.01,
"hints": [
"advanced"
]
},
"minQueryCount": {
"type": "integer",
"title": "Query Clicks Threshold",
"description": "The min number of clicked documents needed for comparing queries.",
"default": 5,
"hints": [
"advanced"
]
},
"keywordsBlobName": {
"type": "string",
"title": "Keywords Blob Store",
"description": "Name of the keywords blob resource. Typically, this should be a csv file uploaded to blob store in a specific format. Check documentation for more details on format and uploading to blob store.",
"reference": "blob",
"blobType": "file:spark"
},
"synonymBlobName": {
"type": "string",
"title": "Custom Synonym Blob Store",
"description": "Name of the custom synonym blob resource. This is a Solr synonym file that will be used in the synonym detection job and will override any generated synonyms (indicated by a 'supplied' field in the Rules UI).",
"hints": [
"advanced"
],
"reference": "blob",
"blobType": "file:spark"
},
"analyzerConfigQuery": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenizing queries (JSON-encoded)",
"default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
"hints": [
"lengthy",
"advanced",
"code/json"
],
"minLength": 1
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"enableAutoPublish": {
"type": "boolean",
"title": "Enable auto-publishing",
"description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
"default": false,
"hints": [
"advanced"
]
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"synonymDetection"
],
"default": "synonymDetection",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"countField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"overlapThreshold"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfigQuery"
]
},
{
"label": "Misc. Parameters",
"properties": [
"keywordsBlobName"
]
}
]
},
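Although Synonym Detection is deprecated, its required list is still easy to satisfy against an aggregated signals collection. A sketch (only the collection name is hypothetical; the remaining values are the schema defaults):

# Hypothetical config for the deprecated synonym-detection job.
synonym_job = {
    "id": "synonym-detection",
    "trainingCollection": "my_app_signals_aggr",  # hypothetical
    "fieldToVectorize": "query_s",                # schema default
    "dataFormat": "solr",                         # schema default
    "misspellingSQLDataFormat": "solr",           # schema default
    "phraseSQLDataFormat": "solr",                # schema default
    "countField": "aggr_count_i",                 # schema default
    "docIdField": "doc_id_s",                     # schema default
    "type": "synonymDetection",
}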
{
"type": "object",
"title": "Cluster Labeling",
"description": "Use this job when you already have clusters or well-defined document categories, and you want to discover and attach keywords to see representative words within those existing clusters. (If you want to create new clusters, use the Document Clustering job.)",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"clusterIdField",
"outputCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing documents with defined categories or clusters",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to detect keywords from",
"description": "Field containing data from which to discover keywords for the cluster",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store output data to",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"clusterIdField": {
"type": "string",
"title": "Existing Document Category Field",
"description": "Field that contains your existing cluster IDs or document categories.",
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
],
"minLength": 1
},
"clusterLabelField": {
"type": "string",
"title": "Top Unique Terms Field Name",
"description": "Output field name for top frequent terms that are (mostly) unique for each cluster.",
"default": "cluster_label"
},
"freqTermField": {
"type": "string",
"title": "Top Frequent Terms Field Name",
"description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
"default": "freq_terms"
},
"minDF": {
"type": "number",
"title": "Min Doc Support",
"description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 5
},
"maxDF": {
"type": "number",
"title": "Max Doc Support",
"description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
"default": 0.75
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"numKeywordsPerLabel": {
"type": "integer",
"title": "Number of Keywords for Each Cluster",
"description": "Number of Keywords needed for labeling each cluster.",
"default": 5
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"cluster_labeling"
],
"default": "cluster_labeling",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"clusterIdField",
"freqTermField",
"clusterLabelField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"maxDF",
"minDF",
"norm",
"numKeywordsPerLabel"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
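Cluster Labeling assumes the cluster assignments already exist in a field of the input collection. A minimal sketch with hypothetical collection and field names:

# Hypothetical config attaching keywords to pre-existing clusters.
cluster_labeling_job = {
    "id": "label-clusters",
    "trainingCollection": "docs_clustered",  # hypothetical input with cluster IDs
    "fieldToVectorize": "body_t",            # hypothetical text field to mine keywords from
    "dataFormat": "solr",                    # schema default
    "clusterIdField": "cluster_id",          # hypothetical existing cluster/category field
    "outputCollection": "docs_labeled",      # hypothetical output collection
    "numKeywordsPerLabel": 5,                # schema default
    "type": "cluster_labeling",
}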
{
"type": "object",
"title": "Build Training Data",
"description": "Use this job to build training data for query classification by joining signals with catalog.",
"required": [
"id",
"fieldToVectorize",
"catalogPath",
"catalogFormat",
"signalsPath",
"outputPath",
"categoryField",
"catalogIdField",
"itemIdField",
"countField",
"analyzerConfig",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"hints": [
"dummy",
"hidden"
],
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Query Field",
"description": "Field containing query strings.",
"default": "query_s",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Signals Format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"hints": [
"dummy"
],
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Signal Data Filter Query",
"description": "Solr query to additionally filter signals. For non-solr data source use SPARK SQL FILTER QUERY under Advanced to filter results",
"default": "*:*",
"hints": [
"dummy"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to",
"hints": [
"dummy",
"hidden"
]
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"dummy"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"dummy",
"hidden"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"catalogPath": {
"type": "string",
"title": "Catalog Path",
"description": "Catalog collection or cloud storage path which contains item categories."
},
"catalogFormat": {
"type": "string",
"title": "Catalog Format",
"description": "Spark-compatible format that contains catalog data (like 'solr', 'parquet', 'orc' etc)"
},
"signalsPath": {
"type": "string",
"title": "Signals Path",
"description": "Signals collection or cloud storage path which contains item categories."
},
"outputPath": {
"type": "string",
"title": "Output Path",
"description": "Output collection or cloud storage path which contains item categories."
},
"categoryField": {
"type": "string",
"title": "Category Field in Catalog",
"description": "Item category field in catalog."
},
"catalogIdField": {
"type": "string",
"title": "Item Id Field in Catalog",
"description": "Item Id field in catalog, which will be used to join with signals"
},
"itemIdField": {
"type": "string",
"title": "Item Id Field in Signals",
"description": "Item Id field in signals, which will be used to join with catalog.",
"default": "doc_id_s"
},
"countField": {
"type": "string",
"title": "Count Field in Signals",
"description": "Count Field in raw or aggregated signals.",
"default": "aggr_count_i"
},
"topCategoryProportion": {
"type": "number",
"title": "Top Category Proportion",
"description": "Proportion of the top category has to be among all categories.",
"default": 0.5
},
"topCategoryThreshold": {
"type": "integer",
"title": "Minimum Count",
"description": "Minimum number of query,category pair counts.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Text Analyzer",
"description": "The style of text analyzer you would like to use.",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"lengthy",
"code/json"
]
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"build-training"
],
"default": "build-training",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed",
"catalogPath",
"catalogFormat",
"signalsPath",
"outputPath",
"dataOutputFormat",
"partitionCols",
"sparkSQL"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"categoryField",
"catalogIdField",
"itemIdField",
"countField"
]
},
{
"label": "Training Parameters",
"properties": [
"topCategoryProportion",
"topCategoryThreshold"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
}
]
},
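Build Training Data is essentially a join spec: signals on one side, catalog on the other, keyed by item id. A sketch (paths and field names other than the schema defaults are hypothetical):

# Hypothetical config joining signals with a catalog to build training data.
build_training_job = {
    "id": "build-training-data",
    "fieldToVectorize": "query_s",          # schema default
    "catalogPath": "my_catalog",            # hypothetical catalog collection
    "catalogFormat": "solr",
    "signalsPath": "my_app_signals_aggr",   # hypothetical signals collection
    "outputPath": "query_class_training",   # hypothetical output collection
    "categoryField": "category_s",          # hypothetical category field in catalog
    "catalogIdField": "id",                 # hypothetical item id field in catalog
    "itemIdField": "doc_id_s",              # schema default
    "countField": "aggr_count_i",           # schema default
    # analyzerConfig is required but has a schema default; omitted here on the
    # assumption that the server applies that default.
    "type": "build-training",
}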
{
"type": "object",
"title": "Transfer Collection To Cloud",
"description": "Transfer Collection to Cloud Storage, for collections that need to be migrated or copied to cloud storage",
"required": [
"id",
"inputCollection",
"outputLocation",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"inputCollection": {
"type": "string",
"title": "Collection",
"description": "Solr collection to copy",
"minLength": 1
},
"outputLocation": {
"type": "string",
"title": "Output Location",
"description": "URI of output location (e.g. s3a://..., gs://..., wasb://...)",
"minLength": 1
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true
},
"outputFormat": {
"type": "string",
"title": "Output format",
"description": "Format for cloud output (e.g. parquet, json, csv)",
"default": "parquet"
},
"sparkPartitions": {
"type": "integer",
"title": "Set minimum Spark partitions for input",
"description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
"default": 200,
"hints": [
"advanced"
]
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"transfer"
],
"default": "transfer",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
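For illustration, a minimal configuration satisfying this job's required fields (id, inputCollection, outputLocation, type) might look like the following sketch; the job ID, collection name, and bucket URI are hypothetical:

transfer_job = {
    "id": "transfer-products-to-gcs",  # hypothetical job ID
    "inputCollection": "products",  # hypothetical Solr collection to copy
    "outputLocation": "gs://example-bucket/products",  # hypothetical cloud URI
    "outputFormat": "parquet",  # schema default
    "type": "transfer"
}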
{
"type": "object",
"title": "BPR Recommender",
"description": "Use this job when you want to compute user recommendations or item similarities using a Bayesian Personalized Ranking recommender. You can also implement a user-to-item recommender in the advanced section of this job’s configuration UI.",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"outputFormat",
"userIdField",
"itemIdField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"outputBatchSize": {
"type": "string",
"title": "Output Batch Size",
"description": "Batch size of documents when pushing results to solr",
"default": "15000",
"hints": [
"advanced"
]
},
"jobRunName": {
"type": "string",
"title": "Job Run Name",
"description": "Identifier for this job run. Use it to filter recommendations from particular runs.",
"hints": [
"advanced"
]
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"outputUserRecsCollection": {
"type": "string",
"title": "Items-Users Output Path",
"description": "Solr collection or cloud storage path to store batch-predicted user/item recommendations (if absent, none computed). Specify at least one of Items-Users Output Collection or Items-Items Output Collection.",
"minLength": 1
},
"outputItemSimCollection": {
"type": "string",
"title": "Item-Items Output Path",
"description": "Solr collection or cloud storage path to store batch-computed item/item similarities (if absent, none computed). Specify at least one of Items-Users Output Collection or Items-Items Output Collection.",
"minLength": 1
},
"outputFormat": {
"type": "string",
"title": "Output data format",
"description": "The format of the output data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output.",
"hints": [
"advanced"
]
},
"numRecsPerUser": {
"type": "integer",
"title": "No. of recs per user",
"description": "Number of recommendations that will be saved per user.",
"default": 10,
"minimum": 0,
"exclusiveMinimum": false
},
"userTopkAnn": {
"type": "integer",
"title": "No. of User Recs to Compute for Filtering",
"description": "Applies only when Filter Already Clicked Items is enabled. This is used to fetch additional recommendations so that the value specified for the Number of Recommendations Per User is most likely satisfied with filtering turned on.",
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"numSimsPerItem": {
"type": "integer",
"title": "No. of recs per item",
"description": "Number of recommendations that will be saved per item.",
"default": 10,
"minimum": 0,
"exclusiveMinimum": false
},
"deleteOldRecs": {
"type": "boolean",
"title": "Delete Old Recommendations",
"description": "Should previous recommendations be deleted. If this box is unchecked, then old recommendations will not be deleted but new recommendations will be appended with a different Job ID. Both sets of recommendations will be contained within the same collection. Will only work when output path is solr.",
"default": true
},
"excludeFromDeleteFilter": {
"type": "string",
"title": "Exclude from Delete Filter",
"description": "If the 'Delete Old Recommendations' flag is enabled, then use this query filter to identify existing recommendation docs to exclude from delete. The filter should identify recommendation docs you want to keep.",
"hints": [
"advanced"
]
},
"filterClicked": {
"type": "boolean",
"title": "Filter already clicked items",
"description": "Whether to filter out already clicked items in item recommendations for user. Takes more time but drastically improves quality.",
"default": true,
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Training Collection Counts/Weights Field",
"description": "Solr field name containing stored counts/weights the user has for that item. This field is used as weight during training",
"default": "aggr_count_i"
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`.",
"hints": [
"code/sql",
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"userIdField": {
"type": "string",
"title": "Training Collection User Id Field",
"description": "Solr field name in the training collection that contains stored User ID.",
"default": "user_id_s",
"minLength": 1
},
"itemIdField": {
"type": "string",
"title": "Training Collection Item Id Field",
"description": "Solr field name in the training collection that contains stored Item ID.",
"default": "item_id_s",
"minLength": 1
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"itemMetadataFields": {
"type": "array",
"title": "Item Metadata Fields",
"description": "List of item metadata fields to include in the recommendation output documents. WARNING: Adding many fields can lead to huge output sizes or OOM issues.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"itemMetadataCollection": {
"type": "string",
"title": "Item Metadata Path",
"description": "Cloud storage path or Solr collection containing item metadata fields you want to add to the recommendation output documents. Leave blank and fill in the metadata fields if you want to fetch data from the training collection. Join field needs to be specified.",
"hints": [
"advanced"
]
},
"itemMetadataFormat": {
"type": "string",
"title": "Metdata format",
"description": "The format of the metadata - solr, parquet etc.",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"itemMetadataJoinField": {
"type": "string",
"title": "Item Metadata Join Field",
"description": "Name of field in the item metadata collection to join on.",
"hints": [
"advanced"
]
},
"performANN": {
"type": "boolean",
"title": "Perform approximate nearest neighbor search",
"description": "Whether to perform approximate nearest neighbor search (ANN). ANN will drastically reduce training time, but accuracy will drop a little. Disable only if training dataset is very small.",
"default": true
},
"maxNeighbors": {
"type": "integer",
"title": "Max neighbors for indexing",
"description": "If perform ANN, size of the potential neighbors for the indexing phase. Higher value leads to better recall and shorter retrieval times (at the expense of longer indexing time).Reasonable range: 5~100",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"searchNN": {
"type": "integer",
"title": "Search Depth",
"description": "If perform ANN, the depth of search used to find neighbors. Higher value improves recall at the expense of longer retrieval time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"indexNN": {
"type": "integer",
"title": "Indexing Depth",
"description": "If perform ANN, the depth of constructed index. Higher value improves recall at the expense of longer indexing time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"factors": {
"type": "integer",
"title": "Dimension of latent factors",
"description": "Latent factor dimension used for matrix decomposition. Bigger values require more time and memory but usually provide better results.",
"default": 100,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"epochs": {
"type": "integer",
"title": "Training iterations",
"description": "Number of model training iterations. Model will converge better with larger number at the expense of increased training time. For bigger datasets use smaller values.",
"default": 30,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"learningRate": {
"type": "number",
"title": "Learning rate",
"description": "Model learning rate.",
"default": 0.05,
"hints": [
"advanced"
]
},
"metadataCategoryFields": {
"type": "array",
"title": "Metadata fields for item-item evaluation",
"description": "These fields will be used for item-item evaluation and for determining if the recommendation pair belong to the same category.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"minNumItemUniqueClicks": {
"type": "integer",
"title": "Training Data Filtered By Popular Items",
"description": "Items must have at least this no. of unique user interactions to be included for training and recommendations. The higher this value, the more popular items selected but the amount of training data will reduce.",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"minNumUserUniqueClicks": {
"type": "integer",
"title": "Training Data Filtered By User clicks",
"description": "Users must have at least this no. of unique item interactions to be included for training and recommendations. The higher this value, the more active users are selected but the amount of training data will reduce.",
"default": 2,
"minimum": 1,
"exclusiveMinimum": false
},
"minNumClickedProducts": {
"type": "integer",
"title": "Minimum Clicked Products",
"description": "Minimum number of clicked products the user should have to be a candidate for the test set.",
"default": 3,
"minimum": 2,
"exclusiveMinimum": false
},
"maxNumTestUsers": {
"type": "integer",
"title": "Maximum Test Users",
"description": "Maximum number of test users to choose. If more users satisfying the Minimum Clicked Products criterion are present, the number will be capped to what is specified here.",
"default": 10000,
"minimum": 0,
"exclusiveMinimum": false
},
"numTestUserClicks": {
"type": "integer",
"title": "Number of User Clicks for Test",
"description": "How many test user clicks to use for testing. Should be less than the value for Minimum Clicked Products.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"doEvaluation": {
"type": "boolean",
"title": "Evaluate on test data",
"description": "Evaluate how well the trained model predicts user clicks. Test data will be sampled from original dataset."
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-item-recommender-user"
],
"default": "argo-item-recommender-user",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"outputUserRecsCollection",
"outputItemSimCollection",
"outputFormat",
"outputBatchSize",
"secretName",
"partitionFields"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"userIdField",
"itemIdField",
"weightField",
"trainingDataFrameConfigOptions"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"numRecsPerUser",
"numSimsPerItem",
"filterClicked",
"userTopkAnn",
"minNumItemUniqueClicks",
"minNumUserUniqueClicks",
"maxIters",
"deleteOldRecs",
"excludeFromDeleteFilter",
"performANN",
"maxNeighbors",
"searchNN",
"indexNN",
"factors",
"epochs",
"learningRate",
"randomSeed"
]
},
{
"label": "Evaluation Parameters",
"properties": [
"doEvaluation",
"minNumClickedProducts",
"numTestUserClicks",
"maxNumTestUsers"
]
},
{
"label": "Item Metadata Settings",
"properties": [
"itemMetadataCollection",
"itemMetadataFormat",
"itemMetadataJoinField",
"itemMetadataFields",
"metadataCategoryFields"
]
}
]
},
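A minimal sketch of a BPR Recommender configuration, supplying the required fields plus one of the two output paths (the schema requires at least one of the Items-Users or Items-Items outputs); all collection names are hypothetical:

bpr_job = {
    "id": "bpr-user-recs",  # hypothetical job ID
    "trainingCollection": "signals_aggr",  # hypothetical signals collection
    "trainingFormat": "solr",  # schema default
    "outputUserRecsCollection": "user_item_recs",  # at least one output path is required
    "outputFormat": "solr",  # schema default
    "userIdField": "user_id_s",  # schema default
    "itemIdField": "item_id_s",  # schema default
    "type": "argo-item-recommender-user"
}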
{
"type": "object",
"title": "Query-to-Query Collaborative Similarity (deprecated)",
"description": "Use this job to to batch compute query-query similarities using ALS. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the Query-to-Query Session Based Similarity job instead.",
"required": [
"id",
"trainingCollection",
"outputQuerySimCollection",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Recommender Model ID",
"description": "Identifier for the recommender model. Will be used as the unique key when storing the model in Solr.",
"hints": [
"advanced"
]
},
"modelCollection": {
"type": "string",
"title": "Model Collection",
"description": "Collection to load and store the computed model (if absent, it won't be loaded or saved)",
"hints": [
"advanced"
]
},
"saveModel": {
"type": "boolean",
"title": "Save Model in Solr",
"description": "Whether we should save the computed ALS model in Solr",
"default": false,
"hints": [
"advanced"
]
},
"trainingCollection": {
"type": "string",
"title": "Recommender Training Collection",
"description": "Item/Query preference collection (often a signals collection or signals aggregation collection)"
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr query to filter training data (e.g. downsampling or selecting based on min. pref values)",
"default": "*:*",
"hints": [
"advanced"
]
},
"popularQueryMin": {
"type": "integer",
"title": "Training Data Filter By Popular Items",
"description": "Items must have at least this # of unique users interacting with it to go into the sample",
"default": 2,
"hints": [
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Downsample preferences for items (bounded to at least 2) by this fraction",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"outputQuerySimCollection": {
"type": "string",
"title": "Query-to-query Similarity Collection",
"description": "Collection to store batch-computed query/query similarities (if absent, none computed)"
},
"outputItemsForQueriesCollection": {
"type": "string",
"title": "Items-for-query Boosting Collection",
"description": "Collection to store batch-computed items-for-queries recommendations (if absent, none computed)"
},
"queryField": {
"type": "string",
"title": "Training Collection Query Field",
"description": "Solr field name containing stored queries",
"default": "query",
"hints": [
"advanced"
]
},
"itemIdField": {
"type": "string",
"title": "Training Collection Item Id Field",
"description": "Solr field name containing stored item ids",
"default": "item_id_s",
"hints": [
"advanced"
]
},
"weightField": {
"type": "string",
"title": "Training Collection Weight Field",
"description": "Solr field name containing stored weights (i.e. time decayed / position weighted counts) the item has for that query",
"default": "weight_d",
"hints": [
"advanced"
]
},
"numSims": {
"type": "integer",
"title": "Number of Query Similarities to Compute",
"description": "Batch compute and store this many query similarities per query",
"default": 10,
"hints": [
"advanced"
]
},
"numItemsPerQuery": {
"type": "integer",
"title": "Number of Items per Query to Recommend",
"description": "Batch compute and store this many item recommendations per query",
"default": 10,
"hints": [
"advanced"
]
},
"initialRank": {
"type": "integer",
"title": "Recommender Rank",
"description": "Number of user/item factors in the recommender decomposition (or starting guess for it, if doing parameter grid search)",
"default": 100,
"hints": [
"advanced"
]
},
"initialBlocks": {
"type": "integer",
"title": "Training Block Size",
"description": "Number of sub-matrix blocks to break the training data into (default: -1, for auto-sizing)",
"default": -1,
"hints": [
"hidden"
]
},
"maxTrainingIterations": {
"type": "integer",
"title": "Maximum Training Iterations",
"description": "Maximum number of iterations to use when learning the matrix decomposition",
"default": 10,
"hints": [
"advanced"
]
},
"initialAlpha": {
"type": "number",
"title": "Implicit Preference Confidence",
"description": "Confidence weight (between 0 and 1) to give the implicit preferences (or starting guess, if doing parameter grid search)",
"default": 0.5,
"hints": [
"advanced"
]
},
"initialLambda": {
"type": "number",
"title": "Smoothing",
"description": "Smoothing parameter to avoid overfitting (or starting guess, if doing parameter grid search). Slightly larger value needed for small data sets",
"default": 0.01,
"hints": [
"advanced"
]
},
"gridSearchWidth": {
"type": "integer",
"title": "Grid Search Width",
"description": "Parameter grid search to be done centered around initial parameter guesses, exponential step size, this number of steps (if <= 0, no grid search)",
"default": 1,
"hints": [
"advanced"
]
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 13,
"hints": [
"advanced"
]
},
"implicitRatings": {
"type": "boolean",
"title": "Implicit Preferences",
"description": "Treat training preferences as implicit signals of interest (i.e. clicks or other actions) as opposed to explicit query ratings",
"default": true
},
"alwaysTrain": {
"type": "boolean",
"title": "Force model re-training",
"description": "Even if a model with this modelId exists, re-train if set true",
"default": true
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"query_similarity"
],
"default": "query_similarity",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingDataFilterQuery",
"modelCollection",
"outputItemsForQueriesCollection",
"outputQuerySimCollection",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingSampleFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"queryField",
"itemIdField",
"weightField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"alwaysTrain",
"saveModel",
"gridSearchWidth",
"implicitRatings",
"initialAlpha",
"initialLambda",
"initialRank",
"maxTrainingIterations",
"numItemsPerQuery",
"numSims",
"popularQueryMin"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
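Although this job is deprecated, existing configurations only need the required fields; a minimal sketch with hypothetical collection names:

als_job = {
    "id": "query-similarity-als",  # hypothetical job ID
    "trainingCollection": "signals_aggr",  # hypothetical signals collection
    "outputQuerySimCollection": "query_query_sims",  # hypothetical output collection
    "type": "query_similarity"
}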
{
"type": "object",
"title": "Content based Recommender",
"description": "Use this job when you want to compute item similarities based on their content such as product descriptions. ",
"required": [
"id",
"trainingCollection",
"trainingFormat",
"outputCollection",
"outputFormat",
"itemIdField",
"contentField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"outputBatchSize": {
"type": "string",
"title": "Output Batch Size",
"description": "Batch size of documents when pushing results to solr",
"default": "15000",
"hints": [
"advanced"
]
},
"unidecodeText": {
"type": "boolean",
"title": "Unidecode Text",
"description": "Select if you want the text to be unidecoded.",
"default": true
},
"lowercaseText": {
"type": "boolean",
"title": "Lowercase Text",
"description": "Select if you want the text to be lowercased.",
"default": true
},
"vectorizationUseDl": {
"type": "boolean",
"title": "Use Deep Learning for vectorization",
"description": "Select if you want to use deep learning as the method for vectorization. You can choose the other methods too in which case an ensemble will be used.",
"default": true
},
"vectorizationUseFasttext": {
"type": "boolean",
"title": "Use Word2Vec for vectorization",
"description": "Select if you want to use word2vec as the method for vectorization. You can choose the other methods too in which case an ensemble will be used. Custom embeddings will be learned. Useful for jargon."
},
"vectorizationUseTfidf": {
"type": "boolean",
"title": "Use Tf-Idf for vectorization",
"description": "Select if you want to use Tf-idf as the method for vectorization. You can choose the other methods too in which case an ensemble will be used."
},
"vectorizationDlEnsembleWeight": {
"type": "number",
"title": "Deep learning vectorization ensemble weight",
"description": "Ensemble weight for deep learning based vectorization if more than one method of vectorization is selected.",
"default": 1
},
"vectorizationFasttextVectorsSize": {
"type": "integer",
"title": "Size of word vectors",
"description": "Word vector dimensions for Word2Vec vectorizer.",
"default": 150,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextWindowSize": {
"type": "integer",
"title": "Word2Vec window size",
"description": "The window size (context words from [-window, window]) for Word2Vec.",
"default": 5,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextEpochs": {
"type": "integer",
"title": "Word2Vec training epochs",
"description": "Number of epochs to train custom Word2Vec embeddings.",
"default": 15,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextSkipGram": {
"type": "boolean",
"title": "Use SkipGram model",
"description": "Whether to use skip gram for training. If unchecked, CBOW will be used.",
"default": true,
"hints": [
"hidden"
]
},
"vectorizationFasttextMinCount": {
"type": "integer",
"title": "Min count of words",
"description": "Minimum times a token needs to occur in the text to be considered for the vocab.",
"default": 1,
"hints": [
"hidden"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextMaxVocabSize": {
"type": "integer",
"title": "Max vocab size",
"description": "Maximum number of tokens to consider for the vocab. Less frequent tokens will be omitted.",
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextUseSubwordNgram": {
"type": "boolean",
"title": "Use subword ngrams",
"description": "Whether to use subword (character) ngrams.",
"default": true,
"hints": [
"hidden"
]
},
"vectorizationFasttextMinNgram": {
"type": "integer",
"title": "Min Ngram size",
"description": "Minimum size for ngrams generated.",
"default": 3,
"hints": [
"hidden"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextMaxNgram": {
"type": "integer",
"title": "Max Ngram size",
"description": "Maximum size for ngrams generated.",
"default": 6,
"hints": [
"hidden"
],
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationFasttextEnsembleWeight": {
"type": "number",
"title": "Word2Vec vectorization ensemble weight",
"description": "Ensemble weight for Fasttext based vectorization if more than one method of vectorization is selected.",
"default": 1
},
"vectorizationTfidfUseCharacters": {
"type": "boolean",
"title": "Use characters ngrams",
"description": "Whether to use characters. By default words are used."
},
"vectorizationTfidfFilterStopwords": {
"type": "boolean",
"title": "Filter stopwords",
"description": "Whether to filter out stopwords before generating Tf-Idf weights.",
"default": true
},
"vectorizationTfidfMinDf": {
"type": "number",
"title": "Min Document Frequency",
"description": "Minimum Df for token to be considered.",
"hints": [
"hidden"
]
},
"vectorizationTfidfMaxDf": {
"type": "number",
"title": "Max Document Frequency",
"description": "Maximum Df for token to be considered.",
"default": 1,
"hints": [
"hidden"
]
},
"vectorizationTfidfMinNgram": {
"type": "integer",
"title": "Min Ngram size",
"description": "Minimum Ngram size to be used.",
"default": 1,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationTfidfMaxNgram": {
"type": "integer",
"title": "Max Ngram size",
"description": "Maximum Ngram size to be used.",
"default": 3,
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationTfIdfMaxVocabSize": {
"type": "integer",
"title": "Max vocab size",
"description": "Maximum number of tokens to consider for the vocab. Less frequent tokens will be omitted.",
"minimum": 1,
"exclusiveMinimum": false
},
"vectorizationTfidfEnsembleWeight": {
"type": "number",
"title": "Tf-Idf vectorization ensemble weight",
"description": "Ensemble weight for Tf-Idf based vectorization if more than one method of vectorization is selected.",
"default": 1
},
"topKAnn": {
"type": "integer",
"title": "No. of Item Recs to compute for ensemble",
"description": "This is used to fetch additional recommendations so that the value specified for the Number of User Recommendations to Compute is most likely satisfied after filtering. This is normally set to 10 * (No. of item recommendations to compute)",
"default": 100,
"minimum": 1,
"exclusiveMinimum": false
},
"jobRunName": {
"type": "string",
"title": "Job Run Name",
"description": "Identifier for this job run. Use it to filter recommendations from particular runs",
"hints": [
"advanced"
]
},
"trainingCollection": {
"type": "string",
"title": "Training data path",
"description": "Solr collection or cloud storage path where training data is present.",
"minLength": 1
},
"trainingFormat": {
"type": "string",
"title": "Training data format",
"description": "The format of the training data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"secretName": {
"type": "string",
"title": "Cloud storage secret name",
"description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
"hints": [
"advanced"
],
"minLength": 1
},
"outputCollection": {
"type": "string",
"title": "Output data path",
"description": "Solr collection or cloud storage path where output data is to be written."
},
"outputFormat": {
"type": "string",
"title": "Output data format",
"description": "The format of the output data - solr, parquet etc.",
"default": "solr",
"minLength": 1
},
"partitionFields": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"numSimsPerItem": {
"type": "integer",
"title": "No. of Item Recs to Compute",
"description": "Number of recommendations that will be saved per item.",
"default": 10,
"minimum": 1,
"exclusiveMinimum": false
},
"deleteOldRecs": {
"type": "boolean",
"title": "Delete Old Recommendations",
"description": "Should previous recommendations be deleted. If this box is unchecked, then old recommendations will not be deleted but new recommendations will be appended with a different Job ID. Both sets of recommendations will be contained within the same collection. Will only work when output path is solr.",
"default": true
},
"excludeFromDeleteFilter": {
"type": "string",
"title": "Exclude from Delete Filter",
"description": "If the 'Delete Old Recommendations' flag is enabled, then use this query filter to identify existing recommendation docs to exclude from delete. The filter should identify recommendation docs you want to keep.",
"hints": [
"advanced"
]
},
"metadataCategoryFields": {
"type": "array",
"title": "Metadata fields for item-item evaluation",
"description": "These fields will be used for item-item evaluation and for determining if the recommendation pair belongs to the same category.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training Data Filter Query",
"description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`.",
"hints": [
"code/sql",
"advanced"
]
},
"trainingSampleFraction": {
"type": "number",
"title": "Training Data Sampling Fraction",
"description": "Choose a fraction of the data for training.",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"itemIdField": {
"type": "string",
"title": "Training Item Id Field",
"description": "Field name containing stored item ids",
"default": "item_id_s",
"minLength": 1
},
"contentField": {
"type": "array",
"title": "Training Content Field",
"description": "Field name containing item content such as product description",
"items": {
"type": "string"
}
},
"randomSeed": {
"type": "integer",
"title": "Random Seed",
"description": "Pseudorandom determinism fixed by keeping this seed constant",
"default": 12345,
"hints": [
"advanced"
]
},
"itemMetadataFields": {
"type": "array",
"title": "Item Metadata Fields",
"description": "List of item metadata fields to include in the recommendation output documents.",
"hints": [
"advanced"
],
"items": {
"type": "string"
}
},
"vectorizationDlBatchSize": {
"type": "integer",
"title": "Batch size to compute encodings",
"description": "Compute encodings in batches in case hardware out of memory.",
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"performANN": {
"type": "boolean",
"title": "Perform approximate nearest neighbor search",
"description": "Whether to perform approximate nearest neighbor search (ANN). ANN will drastically reduce training time, but accuracy will drop a little. Disable only if dataset is very small.",
"default": true
},
"maxNeighbors": {
"type": "integer",
"title": "Max neighbors for indexing",
"description": "If perform ANN, size of the potential neighbors for the indexing phase. Higher value leads to better recall and shorter retrieval times (at the expense of longer indexing time).Reasonable range: 5~100",
"hints": [
"advanced"
],
"maximum": 100,
"exclusiveMaximum": false,
"minimum": 5,
"exclusiveMinimum": false
},
"searchNN": {
"type": "integer",
"title": "Search Depth",
"description": "If perform ANN, the depth of search used to find neighbors. Higher value improves recall at the expense of longer retrieval time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"indexNN": {
"type": "integer",
"title": "Indexing Depth",
"description": "If perform ANN, the depth of constructed index. Higher value improves recall at the expense of longer indexing time.Reasonable range: 100~2000",
"hints": [
"advanced"
],
"maximum": 2000,
"exclusiveMaximum": false,
"minimum": 100,
"exclusiveMinimum": false
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-item-recommender-content"
],
"default": "argo-item-recommender-content",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"trainingFormat",
"outputCollection",
"outputFormat",
"outputBatchSize",
"secretName",
"partitionFields"
]
},
{
"label": "Training Data Settings",
"properties": [
"trainingDataFilterQuery",
"trainingSampleFraction",
"randomSeed",
"itemIdField",
"contentField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"numSimsPerItem",
"topKAnn",
"performANN",
"maxNeighbors",
"searchNN",
"indexNN",
"unidecodeText",
"lowercaseText",
"deleteOldRecs",
"excludeFromDeleteFilter"
]
},
{
"label": "Vectorization Parameters",
"properties": [
"vectorizationUseDl",
"vectorizationUseFasttext",
"vectorizationUseTfidf"
]
},
{
"label": "Deep Learning Vectorization Parameters",
"properties": [
"vectorizationDlBatchSize",
"vectorizationDlEnsembleWeight"
]
},
{
"label": "Word2Vec Vectorization Parameters",
"properties": [
"vectorizationFasttextVectorsSize",
"vectorizationFasttextWindowSize",
"vectorizationFasttextEpochs",
"vectorizationFasttextMinNgram",
"vectorizationFasttextEnsembleWeight",
"vectorizationFasttextMaxVocabSize"
]
},
{
"label": "Tf-Idf Vectorization Parameters",
"properties": [
"vectorizationTfidfUseCharacters",
"vectorizationTfidfFilterStopwords",
"vectorizationTfidfMinNgram",
"vectorizationTfidfMaxNgram",
"vectorizationTfIdfMaxVocabSize",
"vectorizationTfidfEnsembleWeight"
]
},
{
"label": "Item Metadata Settings",
"properties": [
"itemMetadataFields",
"metadataCategoryFields"
]
}
]
},
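A minimal sketch of a Content based Recommender configuration; note that contentField is an array, so multiple content fields can be supplied. All names are hypothetical:

content_job = {
    "id": "content-item-sims",  # hypothetical job ID
    "trainingCollection": "products",  # hypothetical catalog collection
    "trainingFormat": "solr",  # schema default
    "outputCollection": "item_item_recs",  # hypothetical output collection
    "outputFormat": "solr",  # schema default
    "itemIdField": "item_id_s",  # schema default
    "contentField": ["description_t"],  # hypothetical content field(s); must be an array
    "type": "argo-item-recommender-content"
}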
{
"type": "object",
"title": "Delete Seldon Core Model Deployment",
"description": "Removes a Seldon Core deployment from the cluster",
"required": [
"id",
"modelName",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Seldon Core deployment to delete",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-delete-model"
],
"default": "argo-delete-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
},
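Deleting a deployment only requires the job ID, the model name, and the job type; the names below are hypothetical, and the model name must match the pattern above (a lowercase DNS subdomain):

delete_job = {
    "id": "delete-my-model",  # hypothetical job ID
    "modelName": "my-model",  # hypothetical Seldon Core deployment name
    "type": "argo-delete-model"
}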
{
"type": "object",
"title": "Logistic Regression Classifier Training (deprecated)",
"description": "Use this job when you have training data and you want to train a logistic regression model to classify text into groups. Deprecated as of Fusion 5.2.0 and will be removed in a future release; use the Classification job instead.",
"required": [
"id",
"trainingCollection",
"fieldToVectorize",
"dataFormat",
"trainingLabelField",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Spark Job ID",
"description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Spark Settings",
"description": "Spark configuration settings.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"trainingCollection": {
"type": "string",
"title": "Training Collection",
"description": "Solr Collection containing labeled training data",
"minLength": 1
},
"fieldToVectorize": {
"type": "string",
"title": "Field to Vectorize",
"description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
"minLength": 1
},
"dataFormat": {
"type": "string",
"title": "Data format",
"description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
"default": "solr",
"minLength": 1
},
"trainingDataFrameConfigOptions": {
"type": "object",
"title": "Dataframe Config Options",
"description": "Additional spark dataframe loading configuration options",
"properties": {},
"additionalProperties": {
"type": "string"
},
"hints": [
"advanced"
]
},
"trainingDataFilterQuery": {
"type": "string",
"title": "Training data filter query",
"description": "Solr query to use when loading training data if using Solr",
"default": "*:*",
"hints": [
"advanced"
]
},
"sparkSQL": {
"type": "string",
"title": "Spark SQL filter query",
"description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
"default": "SELECT * from spark_input",
"hints": [
"code/sql",
"advanced"
]
},
"trainingDataSamplingFraction": {
"type": "number",
"title": "Training data sampling fraction",
"description": "Fraction of the training data to use",
"default": 1,
"hints": [
"advanced"
],
"maximum": 1,
"exclusiveMaximum": false
},
"randomSeed": {
"type": "integer",
"title": "Random seed",
"description": "For any deterministic pseudorandom number generation",
"default": 1234,
"hints": [
"advanced"
]
},
"outputCollection": {
"type": "string",
"title": "Output Collection",
"description": "Solr Collection to store model-labeled data to"
},
"overwriteOutput": {
"type": "boolean",
"title": "Overwrite Output",
"description": "Overwrite output collection",
"default": true,
"hints": [
"hidden",
"advanced"
]
},
"dataOutputFormat": {
"type": "string",
"title": "Data output format",
"description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
"default": "solr",
"hints": [
"advanced"
],
"minLength": 1
},
"sourceFields": {
"type": "string",
"title": "Fields to Load",
"description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
"hints": [
"advanced"
]
},
"partitionCols": {
"type": "string",
"title": "Partition fields",
"description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
"hints": [
"advanced"
]
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"modelId": {
"type": "string",
"title": "Model ID",
"description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
"hints": [
"advanced"
],
"minLength": 1
},
"analyzerConfig": {
"type": "string",
"title": "Lucene Analyzer Schema",
"description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
"default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
"hints": [
"advanced",
"code/json",
"lengthy"
]
},
"withIdf": {
"type": "boolean",
"title": "IDF Weighting",
"description": "Weight vector components based on inverse document frequency",
"default": true,
"hints": [
"advanced"
]
},
"w2vDimension": {
"type": "integer",
"title": "Word2Vec Dimension",
"description": "Word-vector dimensionality to represent text (choose > 0 to use)",
"default": 0,
"hints": [
"advanced"
],
"minimum": 0,
"exclusiveMinimum": false
},
"w2vWindowSize": {
"type": "integer",
"title": "Word2Vec Window Size",
"description": "The window size (context words from [-window, window]) for word2vec",
"default": 5,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxSentenceLength": {
"type": "integer",
"title": "Max Word2Vec Sentence Length",
"description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
"default": 1000,
"hints": [
"advanced"
],
"minimum": 3,
"exclusiveMinimum": false
},
"w2vMaxIter": {
"type": "integer",
"title": "Max Word2Vec Iterations",
"description": "Maximum number of iterations of the word2vec training",
"default": 1,
"hints": [
"advanced"
]
},
"w2vStepSize": {
"type": "number",
"title": "Word2Vec Step Size",
"description": "Training parameter for word2vec convergence (change at your own peril)",
"default": 0.025,
"hints": [
"advanced"
],
"minimum": 0.005,
"exclusiveMinimum": false
},
"minDF": {
"type": "number",
"title": "Minimum Term Document Frequency",
"description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
"default": 0,
"hints": [
"advanced"
]
},
"maxDF": {
"type": "number",
"title": "Max Term Document Frequency",
"description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
"default": 1,
"hints": [
"advanced"
]
},
"norm": {
"type": "integer",
"title": "Vector normalization",
"description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
"enum": [
-1,
0,
1,
2
],
"default": 2,
"hints": [
"advanced"
]
},
"predictedLabelField": {
"type": "string",
"title": "Predicted Label Field",
"description": "Solr field which will contain labels when classifier is applied to documents",
"default": "labelPredictedByFusionModel",
"hints": [
"advanced"
]
},
"serializeAsMleap": {
"type": "boolean",
"title": "Serialize as Mleap Bundle",
"description": "Serialize the output model as Mleap Bundle",
"default": true,
"hints": [
"hidden"
]
},
"minSparkPartitions": {
"type": "integer",
"title": "Minimum Number of Spark Partitions",
"description": "Minimum number of Spark partitions for training job.",
"default": 200,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"stopwordsList": {
"type": "array",
"title": "List of stopwords",
"description": "Stopwords defined in Lucene analyzer config",
"hints": [
"readonly",
"hidden"
],
"items": {
"type": "string",
"minLength": 1,
"reference": "blob",
"blobType": "file:spark"
}
},
"overwriteExistingModel": {
"type": "boolean",
"title": "Overwrite existing model",
"description": "If a model exists in the model store, overwrite when this job runs",
"default": true,
"hints": [
"advanced"
]
},
"trainingLabelField": {
"type": "string",
"title": "Label Field",
"description": "Solr field containing labels for training instances (should be single-valued strings)"
},
"gridSearch": {
"type": "boolean",
"title": "Grid Search with Cross Validation",
"description": "Perform grid search to optimize hyperparameters",
"default": false
},
"evaluationMetricType": {
"type": "string",
"title": "Evaluation Metric Type",
"description": "Optimize hyperparameter search over one of [binary, multiclass, regression] metrics, or 'none'",
"enum": [
"binary",
"multiclass",
"regression",
"none"
],
"default": "none",
"hints": [
"advanced"
]
},
"autoBalanceClasses": {
"type": "boolean",
"title": "Auto-balance training classes",
"description": "Ensure that all classes of training data have the same size",
"default": true,
"hints": [
"advanced"
]
},
"minTrainingSamplesPerClass": {
"type": "integer",
"title": "Minimum Labeled Class Size",
"description": "Ensure that all classes of training data have at least this many examples",
"default": 100,
"hints": [
"advanced"
],
"minimum": 1,
"exclusiveMinimum": false
},
"makeOtherClass": {
"type": "boolean",
"title": "Make 'Other' Class",
"description": "Create a label class 'Other' which contains all examples not in a class large enough to train on",
"default": true,
"hints": [
"advanced"
]
},
"otherClassName": {
"type": "string",
"title": "'Other' class name",
"description": "Label class name for the catch-all 'Other' class",
"default": "Other",
"hints": [
"advanced"
],
"minLength": 1
},
"regularizationWeight": {
"type": "number",
"title": "Regularization weight",
"description": "Degree of regularization to use when training (L2 lambda parameter if elasticNetWeight = 0)",
"default": 0.01,
"maximum": 1,
"exclusiveMaximum": false,
"minimum": 0.000001,
"exclusiveMinimum": false
},
"elasticNetWeight": {
"type": "number",
"title": "Elastic net weight",
"description": "Value between 0 and 1 to interpolate between ridge (0.0) and lasso (1.0) regression",
"default": 0,
"maximum": 1,
"exclusiveMaximum": false
},
"maxIters": {
"type": "integer",
"title": "Maximum number of iterations",
"description": "Maximum number of iterations to perform before halting, even if the convergence criterion has not been met.",
"default": 10
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"logistic_regression_classifier_trainer"
],
"default": "logistic_regression_classifier_trainer",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1,
"propertyGroups": [
{
"label": "Input/Output Parameters",
"properties": [
"trainingCollection",
"outputCollection",
"dataFormat",
"trainingDataFilterQuery",
"readOptions",
"writeOptions",
"trainingDataFrameConfigOptions",
"trainingDataSamplingFraction",
"randomSeed"
]
},
{
"label": "Field Parameters",
"properties": [
"fieldToVectorize",
"sourceFields",
"predictedLabelField",
"trainingLabelField"
]
},
{
"label": "Model Tuning Parameters",
"properties": [
"w2vDimension",
"w2vWindowSize",
"w2vMaxIter",
"w2vMaxSentenceLength",
"w2vStepSize",
"withIdf",
"maxDF",
"minDF",
"norm",
"autoBalanceClasses",
"evaluationMetricType",
"minTrainingSamplesPerClass",
"otherClassName",
"makeOtherClass",
"gridSearch",
"elasticNetWeight",
"maxIters",
"regularizationWeight"
]
},
{
"label": "Featurization Parameters",
"properties": [
"analyzerConfig"
]
},
{
"label": "Misc. Parameters",
"properties": [
"modelId"
]
}
]
},
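This job is likewise deprecated, but a minimal configuration for it only needs the required fields; the field and collection names below are hypothetical:

logreg_job = {
    "id": "logreg-classifier",  # hypothetical job ID
    "trainingCollection": "labeled_docs",  # hypothetical training collection
    "fieldToVectorize": "body_t",  # hypothetical text field
    "dataFormat": "solr",  # schema default
    "trainingLabelField": "label_s",  # hypothetical single-valued label field
    "type": "logistic_regression_classifier_trainer"
}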
{
"type": "object",
"title": "Create Ray Model Deployment",
"description": "Deploys a Ray Model into the Fusion cluster",
"required": [
"id",
"deployModelName",
"modelCpuLimit",
"modelMemoryLimit",
"modelDockerRepo",
"modelDockerImage",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Job ID",
"description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
"maxLength": 63,
"pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
},
"sparkConfig": {
"type": "array",
"title": "Additional parameters",
"description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"writeOptions": {
"type": "array",
"title": "Write Options",
"description": "Options used when writing output to Solr or other sources",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"readOptions": {
"type": "array",
"title": "Read Options",
"description": "Options used when reading input from Solr or other sources.",
"hints": [
"advanced"
],
"items": {
"type": "object",
"required": [
"key"
],
"properties": {
"key": {
"type": "string",
"title": "Parameter Name"
},
"value": {
"type": "string",
"title": "Parameter Value"
}
}
}
},
"deployModelName": {
"type": "string",
"title": "Model name",
"description": "The model name of the Ray deployment to deploy (must be a valid lowercased DNS subdomain with no underscores).",
"maxLength": 30,
"pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"
},
"modelMinReplicas": {
"type": "integer",
"title": "Model min replicas",
"description": "Minimum number of replicas of the model to be deployed",
"default": 1
},
"modelMaxReplicas": {
"type": "integer",
"title": "Model max replicas",
"description": "Maximum number of replicas of the model to be deployed",
"default": 1
},
"modelCpuLimit": {
"type": "number",
"title": "Model CPU limit",
"description": "Maximum number of CPUs that can be allocated to a single model replica",
"default": 1
},
"modelMemoryLimit": {
"type": "string",
"title": "Model memory limit",
"description": "Maximum amount of memory that can be allocated to a single model replica",
"default": "1Gi",
"pattern": "^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$"
},
"modelImportPath": {
"type": "string",
"title": "Ray deployment import path",
"description": "The path to your top-level Ray Serve deployment (or the same path passed to `serve run`)",
"default": "deployment:app"
},
"modelDockerRepo": {
"type": "string",
"title": "Docker repository",
"description": "Defines the Docker repository where the model image is located."
},
"modelDockerImage": {
"type": "string",
"title": "Image name",
"description": "Name of the model's docker image"
},
"modelDockerSecret": {
"type": "string",
"title": "Kubernetes secret name for model repo",
"description": "Defines the Kubernetes secret to be used with the Docker repository"
},
"type": {
"type": "string",
"title": "Spark Job Type",
"enum": [
"argo-deploy-ray-model"
],
"default": "argo-deploy-ray-model",
"hints": [
"readonly"
]
}
},
"additionalProperties": true,
"category": "Other",
"categoryPriority": 1
}
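A minimal sketch of a Ray deployment configuration with the required fields; the repository and image names are hypothetical:

ray_job = {
    "id": "deploy-my-ray-model",  # hypothetical job ID
    "deployModelName": "my-ray-model",  # must match the DNS-subdomain pattern above
    "modelCpuLimit": 1,  # schema default
    "modelMemoryLimit": "1Gi",  # schema default
    "modelDockerRepo": "example-repo",  # hypothetical Docker repository
    "modelDockerImage": "my-ray-model:latest",  # hypothetical image name
    "type": "argo-deploy-ray-model"
}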
]
}

The Authorization header is a Basic authentication header of the form Basic <encoded-value>, where <encoded-value> is the base64-encoded string username:password.
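Because the response is a standard JSON Schema, a job configuration can be validated client-side before it is submitted. The sketch below assumes the third-party jsonschema Python package and a job-configuration endpoint at /api/spark/configurations; verify the endpoint against your Fusion version's API reference:

import requests
import jsonschema  # third-party package: pip install jsonschema

base = "https://{FUSION HOST}/api/spark"
headers = {"Authorization": "Basic <encoded-value>"}

# Fetch the schema shown above.
schema = requests.get(f"{base}/schema", headers=headers).json()

# Hypothetical "transfer" job configuration (see the sketch after that job's definition).
job = {
    "id": "transfer-products-to-gcs",
    "inputCollection": "products",
    "outputLocation": "gs://example-bucket/products",
    "type": "transfer"
}

# Raises jsonschema.exceptions.ValidationError unless exactly one oneOf branch matches.
jsonschema.validate(instance=job, schema=schema)

# Assumed endpoint for creating job configurations.
response = requests.post(f"{base}/configurations", headers=headers, json=job)
print(response.status_code)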