> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Outlier Detection

> Job configuration specifications

export const schema = {
  "type": "object",
  "title": "Outlier Detection",
  "description": "Use this job when you want to find outliers from a set of documents and attach labels for each outlier group.",
  "required": ["id", "trainingCollection", "fieldToVectorize", "dataFormat", "uidField", "outputCollection", "type"],
  "properties": {
    "id": {
      "type": "string",
      "title": "Spark Job ID",
      "description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
      "maxLength": 63,
      "pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
    },
    "sparkConfig": {
      "type": "array",
      "title": "Spark Settings",
      "description": "Spark configuration settings.",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "trainingCollection": {
      "type": "string",
      "title": "Training Collection",
      "description": "Solr Collection containing documents to be clustered",
      "minLength": 1
    },
    "fieldToVectorize": {
      "type": "string",
      "title": "Field to Vectorize",
      "description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
      "minLength": 1
    },
    "dataFormat": {
      "type": "string",
      "title": "Data format",
      "description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
      "default": "solr",
      "minLength": 1
    },
    "trainingDataFrameConfigOptions": {
      "type": "object",
      "title": "Dataframe Config Options",
      "description": "Additional spark dataframe loading configuration options",
      "properties": {},
      "additionalProperties": {
        "type": "string"
      },
      "hints": ["advanced"]
    },
    "trainingDataFilterQuery": {
      "type": "string",
      "title": "Training data filter query",
      "description": "Solr query to use when loading training data if using Solr",
      "default": "*:*",
      "hints": ["advanced"]
    },
    "sparkSQL": {
      "type": "string",
      "title": "Spark SQL filter query",
      "description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
      "default": "SELECT * from spark_input",
      "hints": ["code/sql", "advanced"]
    },
    "trainingDataSamplingFraction": {
      "type": "number",
      "title": "Training data sampling fraction",
      "description": "Fraction of the training data to use",
      "default": 1,
      "hints": ["advanced"],
      "maximum": 1,
      "exclusiveMaximum": false
    },
    "randomSeed": {
      "type": "integer",
      "title": "Random seed",
      "description": "For any deterministic pseudorandom number generation",
      "default": 1234,
      "hints": ["advanced"]
    },
    "outputCollection": {
      "type": "string",
      "title": "Output Collection",
      "description": "Solr Collection to store model-labeled data to",
      "minLength": 1
    },
    "overwriteOutput": {
      "type": "boolean",
      "title": "Overwrite Output",
      "description": "Overwrite output collection",
      "default": true,
      "hints": ["hidden", "advanced"]
    },
    "dataOutputFormat": {
      "type": "string",
      "title": "Data output format",
      "description": "Spark-compatible output format (like 'solr', 'parquet', etc)",
      "default": "solr",
      "hints": ["advanced"],
      "minLength": 1
    },
    "sourceFields": {
      "type": "string",
      "title": "Fields to Load",
      "description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
      "hints": ["advanced"]
    },
    "partitionCols": {
      "type": "string",
      "title": "Partition fields",
      "description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output ",
      "hints": ["advanced"]
    },
    "writeOptions": {
      "type": "array",
      "title": "Write Options",
      "description": "Options used when writing output to Solr or other sources",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "readOptions": {
      "type": "array",
      "title": "Read Options",
      "description": "Options used when reading input from Solr or other sources.",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "modelId": {
      "type": "string",
      "title": "Model ID",
      "description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
      "hints": ["advanced"],
      "minLength": 1
    },
    "outlierGroupIdField": {
      "type": "string",
      "title": "Output Field Name for Outlier Group Id",
      "description": "Output field name for unique outlier group id.",
      "default": "outlier_group_id"
    },
    "outlierGroupLabelField": {
      "type": "string",
      "title": "Top Unique Terms Field Name",
      "description": "Output field name for top frequent terms that are (mostly) unique for each outlier group as computed based on TF-IDF and group Id.",
      "default": "outlier_group_label"
    },
    "outputOutliersOnly": {
      "type": "boolean",
      "title": "Only save outliers?",
      "description": "If true, only outliers are saved in the output collection, otherwise, the whole dataset is saved.",
      "default": false
    },
    "uidField": {
      "type": "string",
      "title": "ID Field Name",
      "description": " Field containing the unique ID for each document.",
      "default": "id",
      "minLength": 1
    },
    "analyzerConfig": {
      "type": "string",
      "title": "Lucene Analyzer Schema",
      "description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
      "default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
      "hints": ["lengthy", "code/json"],
      "minLength": 1
    },
    "freqTermField": {
      "type": "string",
      "title": "Top Frequent Terms Field Name",
      "description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
      "default": "freq_terms"
    },
    "distToCenterField": {
      "type": "string",
      "title": "Output Field Name for doc distance to its cluster center",
      "description": "Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).",
      "default": "dist_to_center"
    },
    "norm": {
      "type": "integer",
      "title": "Vector normalization",
      "description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
      "enum": [-1, 0, 1, 2],
      "default": 2,
      "hints": ["advanced"]
    },
    "minDF": {
      "type": "number",
      "title": "Min Doc Support",
      "description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 5
    },
    "maxDF": {
      "type": "number",
      "title": "Max Doc Support",
      "description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 0.75
    },
    "numKeywordsPerLabel": {
      "type": "integer",
      "title": "Number of Keywords for Each Cluster",
      "description": "Number of Keywords needed for labeling each cluster.",
      "default": 5
    },
    "outlierK": {
      "type": "integer",
      "title": "Number of outlier groups",
      "description": "Number of clusters to help find outliers.",
      "default": 10,
      "hints": ["advanced"]
    },
    "outlierThreshold": {
      "type": "number",
      "title": "Outlier cutoff",
      "description": "Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 0.01,
      "hints": ["advanced"]
    },
    "stopwordsList": {
      "type": "array",
      "title": "List of stopwords",
      "description": "Stopwords defined in Lucene analyzer config",
      "hints": ["readonly", "hidden"],
      "items": {
        "type": "string",
        "minLength": 1,
        "reference": "blob",
        "blobType": "file:spark"
      }
    },
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["outlier_detection"],
      "default": "outlier_detection",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1,
  "propertyGroups": [{
    "label": "Input/Output Parameters",
    "properties": ["trainingCollection", "outputCollection", "dataFormat", "trainingDataFilterQuery", "readOptions", "writeOptions", "trainingDataFrameConfigOptions", "trainingDataSamplingFraction", "randomSeed", "outputOutliersOnly"]
  }, {
    "label": "Field Parameters",
    "properties": ["fieldToVectorize", "sourceFields", "uidField", "outlierGroupIdField", "outlierGroupLabelField", "freqTermField", "distToCenterField"]
  }, {
    "label": "Model Tuning Parameters",
    "properties": ["outlierK", "outlierThreshold", "maxDF", "minDF", "norm", "numKeywordsPerLabel"]
  }, {
    "label": "Featurization Parameters",
    "properties": ["analyzerConfig"]
  }, {
    "label": "Misc. Parameters",
    "properties": ["modelId"]
  }]
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/lucidworks-search/09-developer-documentation/config-specs/jobs/outlier-detection

[mintlify link]: https://doc.lucidworks.com/docs/lucidworks-search/09-developer-documentation/config-specs/jobs/outlier-detection

[old doc.lw link]: https://doc.lucidworks.com/managed-fusion/5.9/mbp0cy

Outlier detection jobs are run against your data collections, and also perform the following actions:

* Identify information that significantly differs from other data in the collection
* Attach labels to designate each outlier group

To create an Outlier Detection job, sign in to Lucidworks Search and click **Collections > Jobs**. Then click **Add+** and in the Clustering and Outlier Analysis Jobs section, select **Outlier Detection**. You can enter basic and advanced parameters to configure the job. If the field has a default value, it is populated when you click to add the job.

<LwTemplate />

## Basic parameters

<Note>
  To enter advanced parameters in the UI, click **Advanced**. Those parameters are described in [the advanced parameters section](#advanced-parameters).
</Note>

* **Spark job ID.** The unique ID for the Spark job that references this job in the API. This is the `id` field in the configuration file. Required field.
* **Input/Output Parameters.** This section includes these parameters:

  * **Training collection.** The Solr collection that contains documents that will be clustered. The job will be run against this information. This is the `trainingCollection` field in the configuration file. Required field.
  * **Output collection.** The Solr collection where the job output is stored. The job will write the output to this collection. This is the `outputCollection` field in the configuration file. Required field.
  * **Data format.** The format that contains training data. The format must be compatible with Spark and options include `solr`, `parquet`, and `orc`. This is the `dataFormat` field in the configuration file. Required field.
* **Only save outliers?** If this checkbox is selected (set to `true`), only outliers are saved in the job’s output collection. If not selected (set to `false`), the entire dataset is saved in the job’s output collection. This is the `outputOutliersOnly` field in the configuration file. Optional field.
* **Field Parameters.** This section includes these parameters:

  * **Field to vectorize.** The Solr field that contains text training data. To combine data from multiple fields with different weights, enter `field1:weight1`,`field2:weight2`, etc. This is the `fieldToVectorize` field in the configuration file. Required field.
  * **ID field name.** The unique ID for each document. This is the `uidField` field in the configuration file. Required field.
  * **Output field name for outlier group ID.** The field that contains the ID for the outlier group. This is the `outlierGroupIdField` field in the configuration file. Optional field.
  * **Top unique terms field name.** The field where the job output stores the top frequent terms that, for the most part, are unique for each outlier group. The information is computed based on term frequency-inverse document frequency (TF-IDF) and group ID. This is the `outlierGroupLabelField` field in the configuration file. Optional field.
  * **Top frequent terms field name.** The field where the job output stores top frequent terms in each cluster. Terms may overlap with other clusters. This is the `freqTermField` field in the configuration file. Optional field.
  * **Output field name for doc distance to its corresponding cluster center.** The field that contains the document’s distance from the center of its cluster. This is based on the arithmetic mean of all of the documents in the cluster. This denotes how representative the document is in the cluster. This is the `distToCenterField` field in the configuration file. Optional field.
* **Model Tuning Parameters.** This section includes these parameters:

  * **Max doc support.** The maximum number of documents that can contain the term. Values that are `<1.0` indicate a percentage, `1.0` is 100 percent, and `>1.0` indicates the exact number. This is the `maxDF` field in the configuration file. Optional field.
  * **Min doc support.** The minimum number of documents that must contain the term. Values that are `<1.0` indicate a percentage, `1.0` is 100 percent, and `>1.0` indicates the exact number. This is the `minDF` field in the configuration file. Optional field.
  * **Number of keywords for each cluster.** The number of keywords required to label each cluster. This is the `numKeywordsPerLabel` field in the configuration file. Optional field.
* **Featurization Parameters.** This section includes the following parameter:

  * **Lucene analyzer schema.** This is the JSON-encoded Lucene text analyzer schema used for tokenization. This is the `analyzerConfig` field in the configuration file. Optional field.

## Advanced parameters

If you click the **Advanced** toggle, the following optional fields are displayed in the UI.

* **Spark Settings.** The Spark configuration settings include the following:

  * **Spark SQL filter query.** This field contains the Spark SQL query that filters your input data. For example, `SELECT * from spark_input` registers the input data as `spark_input`. This is the `sparkSQL` field in the configuration file.
  * **Data output format.** The format for the job output. The format must be compatible with Spark and options include `solr` and `parquet`. This is the `dataOutputFormat` field in the configuration file.
  * **Partition fields.** If the job output is written to non-Solr sources, this field contains a comma-delimited list of column names that partition the dataframe before the external output is written. This is the `partitionCols` field in the configuration file.
* **Input/Output Parameters.** This advanced option adds these parameters:

  * **Training data filter query.** If Solr is used, this field contains the Solr query executed to load training data. This is the `trainingDataFilterQuery` field in the configuration file.
* **Read Options.** This section lets you enter `parameter name:parameter value` options to use when reading input from Solr or other sources. This is the `readOptions` field in the configuration file.
* **Write Options.** This section lets you enter `parameter name:parameter value` options to use when writing output to Solr or other sources. This is the `writeOptions` field in the configuration file.
* **Dataframe config options.** This section includes these parameters:

  * **Property name:property value.** Each entry defines an additional Spark dataframe loading configuration option. This is the `trainingDataFrameConfigOptions` field in the configuration file.
  * **Training data sampling fraction.** This is the fractional amount of the training data the job will use. This is the `trainingDataSamplingFraction` field in the configuration file.
  * **Random seed.** This value is used in any deterministic pseudorandom number generation to group documents into clusters based on similarities in their content. This is the `randomSeed` field in the configuration file.
* **Field Parameters.** The advanced option adds this parameter:

  * **Fields to load.** This field contains a comma-delimited list of Solr fields to load. If blank, the job selects the required fields to load at runtime. This is the `sourceFields` field in the configuration file.
* **Model Tuning Parameters.** The advanced option adds these parameters:

  * **Number of outlier groups.** The number of clusters to help find outliers. This is the `outlierK` field in the configuration file.
  * **Outlier cutoff.** The fraction out of the total documents to designate as an outlier group. Values that are `<1.0` indicate a percentage, `1.0` is 100 percent, and `>1.0` indicates the exact number. This is the `outlierThreshold` field in the configuration file.
  * **Vector normalization.** The p-norm value used to normalize vectors. A value of `-1` turns off normalization. This is the `norm` field in the configuration file.
* **Miscellaneous Parameters.** This section includes this parameter:

  * **Model ID.** The unique identifier for the model to be trained. If no value is entered, the `Spark Job ID` is used. This is the `modelId` field in the configuration file.

## Configuration properties

<SchemaParamFields schema={schema} />
