> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Document Clustering Jobs

export const schema = {
  // JSON-schema-style definition of the "doc_clustering" Spark job configuration.
  // It is passed to <SchemaParamFields> at the bottom of this page, which renders
  // one parameter entry per item in "properties" (entries hinted "hidden" are skipped).
  // Descriptions are kept free of leading/trailing whitespace so the renderer's
  // end-of-sentence check sees the final punctuation mark.
  "type": "object",
  "title": "Document Clustering",
  "description": "Use this job when you want to cluster a set of documents and attach cluster labels based on topics.",
  "required": ["id", "trainingCollection", "fieldToVectorize", "outputCollection", "uidField", "type"],
  "properties": {
    "id": {
      "type": "string",
      "title": "Spark Job ID",
      // The allowed-character list mirrors "pattern" below, which also accepts digits.
      "description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, 0-9, dash (-) and underscore (_)",
      "maxLength": 128,
      "pattern": "^[A-Za-z0-9_\\-]+$"
    },
    "trainingCollection": {
      "type": "string",
      "title": "Training Collection",
      "description": "Solr Collection containing documents to be clustered",
      "minLength": 1
    },
    "fieldToVectorize": {
      "type": "string",
      "title": "Field to Vectorize",
      "description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
      "minLength": 1
    },
    "dataFormat": {
      "type": "string",
      "title": "Data format",
      "description": "Spark-compatible format which training data comes in (like 'solr', 'hdfs', 'file', 'parquet' etc)",
      "enum": ["solr", "hdfs", "file", "parquet"],
      "default": "solr",
      "hints": ["advanced"]
    },
    "trainingDataFrameConfigOptions": {
      "type": "object",
      "title": "Dataframe Config Options",
      "description": "Additional spark dataframe loading configuration options",
      "properties": {},
      "additionalProperties": {
        "type": "string"
      },
      "hints": ["advanced"]
    },
    "trainingDataFilterQuery": {
      "type": "string",
      "title": "Training data filter query",
      "description": "Solr query to use when loading training data",
      "default": "*:*",
      "hints": ["advanced"],
      "minLength": 3
    },
    "trainingDataSamplingFraction": {
      "type": "number",
      "title": "Training data sampling fraction",
      "description": "Fraction of the training data to use",
      "default": 1,
      "hints": ["advanced"],
      "maximum": 1,
      "exclusiveMaximum": false
    },
    "randomSeed": {
      "type": "integer",
      "title": "Random seed",
      "description": "For any deterministic pseudorandom number generation",
      "default": 1234,
      "hints": ["advanced"]
    },
    "outputCollection": {
      "type": "string",
      "title": "Output Collection",
      "description": "Solr Collection to store model-labeled data to",
      "minLength": 1
    },
    "overwriteOutput": {
      "type": "boolean",
      "title": "Overwrite Output",
      "description": "Overwrite output collection",
      "default": true,
      "hints": ["hidden", "advanced"]
    },
    "sourceFields": {
      "type": "string",
      "title": "Fields to Load",
      "description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
      "hints": ["advanced"]
    },
    "uidField": {
      "type": "string",
      "title": "ID Field Name",
      "description": "Field containing the unique ID for each document.",
      "default": "id",
      "minLength": 1
    },
    "clusterIdField": {
      "type": "string",
      "title": "Output Field Name for Cluster Id",
      "description": "Output field name for unique cluster id.",
      "default": "cluster_id"
    },
    "clusterLabelField": {
      "type": "string",
      "title": "Detected Cluster Keywords Field Name",
      "description": "Output field name for top frequent terms that are (mostly) unique for each cluster.",
      "default": "cluster_label"
    },
    "freqTermField": {
      "type": "string",
      "title": "Top Frequent Terms Field Name",
      "description": "Output field name for top frequent terms in each cluster. These may overlap with other clusters.",
      "default": "freq_terms"
    },
    "distToCenterField": {
      "type": "string",
      "title": "Output Field Name for doc distance to its cluster center",
      "description": "Output field name for doc distance to its corresponding cluster center (measure how representative the doc is).",
      "default": "dist_to_center"
    },
    "minDF": {
      "type": "number",
      "title": "Min Doc Support",
      "description": "Min number of documents the term has to show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 5
    },
    "maxDF": {
      "type": "number",
      "title": "Max Doc Support",
      "description": "Max number of documents the term can show up. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 0.5
    },
    "kExact": {
      "type": "integer",
      "title": "Number of Clusters",
      "description": "Exact number of clusters.",
      "default": 0
    },
    "kMax": {
      "type": "integer",
      "title": "Max Possible Number of Clusters",
      "description": "Max possible number of clusters.",
      "default": 20
    },
    "kMin": {
      "type": "integer",
      "title": "Min Possible Number of Clusters",
      "description": "Min possible number of clusters.",
      "default": 2
    },
    "docLenTrim": {
      "type": "boolean",
      "title": "Find Extreme Length Doc Flag",
      "description": "Whether to separate out docs with extreme lengths.",
      "default": true
    },
    "outlierTrim": {
      "type": "boolean",
      "title": "Find Outliers Flag",
      "description": "Whether to perform outlier detection.",
      "default": true
    },
    "shortLen": {
      "type": "number",
      "title": "Length Threshold for Short Doc",
      "description": "Length threshold to define short document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 5
    },
    "longLen": {
      "type": "number",
      "title": "Length Threshold for Long Doc",
      "description": "Length threshold to define long document. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 0.99
    },
    "numKeywordsPerLabel": {
      "type": "integer",
      "title": "Number of Keywords for Each Cluster",
      "description": "Number of Keywords needed for labeling each cluster.",
      "default": 5
    },
    "modelId": {
      "type": "string",
      "title": "Model ID",
      "description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
      "hints": ["advanced"],
      "minLength": 1
    },
    "w2vDimension": {
      "type": "integer",
      "title": "Word2Vec Dimension",
      "description": "Word-vector dimensionality to represent text (choose > 0 to use, suggested dimension ranges: 100~150)",
      "default": 0,
      "hints": ["advanced"],
      "minimum": 0,
      "exclusiveMinimum": false
    },
    "w2vWindowSize": {
      "type": "integer",
      "title": "Word2Vec Window Size",
      "description": "The window size (context words from [-window, window]) for word2vec",
      "default": 8,
      "hints": ["advanced"],
      "minimum": 3,
      "exclusiveMinimum": false
    },
    "norm": {
      "type": "integer",
      "title": "Vector normalization",
      "description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
      "enum": [-1, 0, 1, 2],
      "default": 2,
      "hints": ["advanced"]
    },
    "analyzerConfig": {
      "type": "string",
      "title": "Lucene Analyzer Schema",
      "description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
      "default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"patternreplace\", \"pattern\": \"^[\\\\d.]+$\", \"replacement\": \" \", \"replace\": \"all\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
      "hints": ["code/json", "advanced", "lengthy"],
      "minLength": 1
    },
    "clusteringMethod": {
      "type": "string",
      "title": "Clustering Method (hierarchical or kmeans)",
      "description": "Choose between hierarchical vs kmeans clustering.",
      "default": "hierarchical",
      "hints": ["advanced"]
    },
    "outlierK": {
      "type": "integer",
      "title": "Number of outlier groups",
      "description": "Number of clusters to help find outliers.",
      "default": 10,
      "hints": ["advanced"]
    },
    "outlierThreshold": {
      "type": "number",
      "title": "Outlier cutoff",
      "description": "Identify as outlier group if less than this percent of total documents. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 0.01,
      "hints": ["advanced"]
    },
    "minDivisibleSize": {
      "type": "number",
      "title": "Minimum divisible cluster size",
      "description": "Clusters must have at least this many documents to be split further. value<1.0 denotes a percentage, value=1.0 denotes 100%, value>1.0 denotes the exact number.",
      "default": 0,
      "hints": ["advanced"]
    },
    "kDiscount": {
      "type": "number",
      "title": "Discount for K when choosing number of clusters",
      "description": "Applies a discount to help favor large/small K (number of clusters). A smaller value pushes K to assume a higher value within the [min, max] K range.",
      "default": 0.7,
      "hints": ["advanced"]
    },
    "stopwordsList": {
      "type": "array",
      "title": "List of stopwords",
      "description": "Stopwords defined in Lucene analyzer config",
      "hints": ["readonly", "hidden"],
      "items": {
        "type": "string",
        "minLength": 1,
        "reference": "blob",
        "blobType": "file:spark"
      }
    },
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["doc_clustering"],
      "default": "doc_clustering",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1,
  "unsafe": false,
  "propertyGroups": [{
    "label": "Input/Output Parameters",
    "properties": ["trainingCollection", "outputCollection", "dataFormat", "trainingDataFilterQuery", "trainingDataFrameConfigOptions", "trainingDataSamplingFraction", "randomSeed"]
  }, {
    "label": "Field Parameters",
    "properties": ["fieldToVectorize", "sourceFields", "uidField", "clusterIdField", "freqTermField", "clusterLabelField", "distToCenterField"]
  }, {
    "label": "Model Tuning Parameters",
    "properties": ["clusteringMethod", "outlierTrim", "outlierK", "outlierThreshold", "kExact", "kMax", "kMin", "w2vDimension", "w2vWindowSize", "maxDF", "minDF", "norm", "numKeywordsPerLabel", "minDivisibleSize", "kDiscount"]
  }, {
    "label": "Featurization Parameters",
    "properties": ["analyzerConfig", "docLenTrim", "longLen", "shortLen"]
  }, {
    "label": "Misc. Parameters",
    "properties": ["modelId"]
  }]
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-ai/reference/jobs/document-clustering

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-ai/reference/jobs/document-clustering

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/582

Cluster a set of documents and attach cluster labels.

<LwTemplate />

<SchemaParamFields schema={schema} />
