> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Random Forest Classifier Training Jobs

export const schema = {
  // JSON-Schema-style description of the "Random Forest Classifier Training" job configuration.
  // This object is passed to <SchemaParamFields schema={schema} /> at the bottom of this page,
  // which renders the top-level "description" and then one entry per key of "properties",
  // in the order the keys are written here. Properties whose "hints" include "hidden" are skipped.
  "type": "object",
  "title": "Random Forest Classifier Training",
  "description": "Use this job when you have training data and you want to train a random forest model to classify text into groups.",
  // Names listed here are flagged as required by the renderer on this page.
  "required": ["id", "trainingCollection", "fieldToVectorize", "trainingLabelField", "type"],
  "properties": {
    // NOTE(review): the description lists only letters, dash and underscore as allowed
    // characters, but the pattern below also accepts digits 0-9 — confirm which is intended.
    "id": {
      "type": "string",
      "title": "Spark Job ID",
      "description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
      "maxLength": 128,
      "pattern": "^[A-Za-z0-9_\\-]+$"
    },
    "trainingCollection": {
      "type": "string",
      "title": "Training Collection",
      "description": "Solr Collection containing labeled training data",
      "minLength": 1
    },
    "fieldToVectorize": {
      "type": "string",
      "title": "Field to Vectorize",
      "description": "Solr field containing text training data. Data from multiple fields with different weights can be combined by specifying them as field1:weight1,field2:weight2 etc.",
      "minLength": 1
    },
    "dataFormat": {
      "type": "string",
      "title": "Data format",
      "description": "Spark-compatible format which training data comes in (like 'solr', 'hdfs', 'file', 'parquet' etc)",
      "enum": ["solr", "hdfs", "file", "parquet"],
      "default": "solr",
      "hints": ["advanced"]
    },
    // Free-form map: no fixed sub-properties are declared, and any additional key must hold a string.
    "trainingDataFrameConfigOptions": {
      "type": "object",
      "title": "Dataframe Config Options",
      "description": "Additional spark dataframe loading configuration options",
      "properties": {},
      "additionalProperties": {
        "type": "string"
      },
      "hints": ["advanced"]
    },
    "trainingDataFilterQuery": {
      "type": "string",
      "title": "Training data filter query",
      "description": "Solr query to use when loading training data",
      "default": "*:*",
      "hints": ["advanced"],
      "minLength": 3
    },
    "trainingDataSamplingFraction": {
      "type": "number",
      "title": "Training data sampling fraction",
      "description": "Fraction of the training data to use",
      "default": 1,
      "hints": ["advanced"],
      "maximum": 1,
      "exclusiveMaximum": false
    },
    "randomSeed": {
      "type": "integer",
      "title": "Random seed",
      "description": "For any deterministic pseudorandom number generation",
      "default": 1234,
      "hints": ["advanced"]
    },
    "outputCollection": {
      "type": "string",
      "title": "Output Collection",
      "description": "Solr Collection to store model-labeled data to"
    },
    // Hinted "hidden": filtered out by SchemaParamFields, so it is not rendered on this page.
    "overwriteOutput": {
      "type": "boolean",
      "title": "Overwrite Output",
      "description": "Overwrite output collection",
      "default": true,
      "hints": ["hidden", "advanced"]
    },
    "sourceFields": {
      "type": "string",
      "title": "Fields to Load",
      "description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
      "hints": ["advanced"]
    },
    "modelId": {
      "type": "string",
      "title": "Model ID",
      "description": "Identifier for the model to be trained; uses the supplied Spark Job ID if not provided.",
      "hints": ["advanced"],
      "minLength": 1
    },
    // The default is a JSON document stored as a string, hence the escaped quotes.
    // Because it is a long string containing double quotes, the renderer on this page
    // shows it in a separate "Default:" code block rather than inline.
    "analyzerConfig": {
      "type": "string",
      "title": "Lucene Analyzer Schema",
      "description": "LuceneTextAnalyzer schema for tokenization (JSON-encoded)",
      "default": "{ \"analyzers\": [{ \"name\": \"StdTokLowerStop\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"standard\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"KStem\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"fusionstop\", \"ignoreCase\": \"true\", \"format\": \"snowball\", \"words\": \"org/apache/lucene/analysis/snowball/english_stop.txt\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"StdTokLowerStop\" } ]}",
      "hints": ["advanced", "code/json", "lengthy"]
    },
    "withIdf": {
      "type": "boolean",
      "title": "IDF Weighting",
      "description": "Weight vector components based on inverse document frequency",
      "default": true,
      "hints": ["advanced"]
    },
    // Default is 0; per the description, a value > 0 is what enables word2vec.
    "w2vDimension": {
      "type": "integer",
      "title": "Word2Vec Dimension",
      "description": "Word-vector dimensionality to represent text (choose > 0 to use)",
      "default": 0,
      "hints": ["advanced"],
      "minimum": 0,
      "exclusiveMinimum": false
    },
    "w2vWindowSize": {
      "type": "integer",
      "title": "Word2Vec Window Size",
      "description": "The window size (context words from [-window, window]) for word2vec",
      "default": 5,
      "hints": ["advanced"],
      "minimum": 3,
      "exclusiveMinimum": false
    },
    "w2vMaxSentenceLength": {
      "type": "integer",
      "title": "Max Word2Vec Sentence Length",
      "description": "Sets the maximum length (in words) of each sentence in the input data. Any sentence longer than this threshold will be divided into chunks of up to `maxSentenceLength` size.",
      "default": 1000,
      "hints": ["advanced"],
      "minimum": 3,
      "exclusiveMinimum": false
    },
    "w2vMaxIter": {
      "type": "integer",
      "title": "Max Word2Vec Iterations",
      "description": "Maximum number of iterations of the word2vec training",
      "default": 1,
      "hints": ["advanced"]
    },
    "w2vStepSize": {
      "type": "number",
      "title": "Word2Vec Step Size",
      "description": "Training parameter for word2vec convergence (change at your own peril)",
      "default": 0.025,
      "hints": ["advanced"],
      "minimum": 0.005,
      "exclusiveMinimum": false
    },
    "minDF": {
      "type": "number",
      "title": "Minimum Term Document Frequency",
      "description": "To be kept, terms must occur in at least this number of documents (if > 1.0), or at least this fraction of documents (if <= 1.0)",
      "default": 0,
      "hints": ["advanced"]
    },
    "maxDF": {
      "type": "number",
      "title": "Max Term Document Frequency",
      "description": "To be kept, terms must occur in no more than this number of documents (if > 1.0), or no more than this fraction of documents (if <= 1.0)",
      "default": 1,
      "hints": ["advanced"]
    },
    "norm": {
      "type": "integer",
      "title": "Vector normalization",
      "description": "p-norm to normalize vectors with (choose -1 to turn normalization off)",
      "enum": [-1, 0, 1, 2],
      "default": 2,
      "hints": ["advanced"]
    },
    "predictedLabelField": {
      "type": "string",
      "title": "Predicted Label Field",
      "description": "Solr field which will contain labels when classifier is applied to documents",
      "default": "labelPredictedByFusionModel",
      "hints": ["advanced"]
    },
    // Hinted "hidden": filtered out by SchemaParamFields, so it is not rendered on this page.
    "serializeAsMleap": {
      "type": "boolean",
      "title": "Serialize as Mleap Bundle",
      "description": "Serialize the output model as Mleap Bundle",
      "default": true,
      "hints": ["hidden"]
    },
    "minSparkPartitions": {
      "type": "integer",
      "title": "Minimum Number of Spark Partitions",
      "description": "Minimum number of Spark partitions for training job.",
      "default": 200,
      "hints": ["advanced"],
      "minimum": 1,
      "exclusiveMinimum": false
    },
    // Hinted "hidden": filtered out by SchemaParamFields, so it is not rendered on this page.
    "stopwordsList": {
      "type": "array",
      "title": "List of stopwords",
      "description": "Stopwords defined in Lucene analyzer config",
      "hints": ["readonly", "hidden"],
      "items": {
        "type": "string",
        "minLength": 1,
        "reference": "blob",
        "blobType": "file:spark"
      }
    },
    "trainingLabelField": {
      "type": "string",
      "title": "Label Field",
      "description": "Solr field containing labels for training instances (should be single-valued strings)"
    },
    "gridSearch": {
      "type": "boolean",
      "title": "Grid Search with Cross Validation",
      "description": "Perform grid search to optimize hyperparameters",
      "default": false
    },
    "evaluationMetricType": {
      "type": "string",
      "title": "Evaluation Metric Type",
      "description": "Optimize hyperparameter search over one of [binary, multiclass, regression] metrics, or 'none'",
      "enum": ["binary", "multiclass", "regression", "none"],
      "default": "none",
      "hints": ["advanced"]
    },
    "autoBalanceClasses": {
      "type": "boolean",
      "title": "Auto-balance training classes",
      "description": "Ensure that all classes of training data have the same size",
      "default": true,
      "hints": ["advanced"]
    },
    "minTrainingSamplesPerClass": {
      "type": "integer",
      "title": "Minimum Labeled Class Size",
      "description": "Ensure that all classes of training data have at least this many examples",
      "default": 100,
      "hints": ["advanced"],
      "minimum": 1,
      "exclusiveMinimum": false
    },
    "makeOtherClass": {
      "type": "boolean",
      "title": "Make 'Other' Class",
      "description": "Create a label class 'Other' which contains all examples not in a class large enough to train on",
      "default": true,
      "hints": ["advanced"]
    },
    "otherClassName": {
      "type": "string",
      "title": "'Other' class name",
      "description": "Label class name for the catch-all 'Other' class",
      "default": "Other",
      "hints": ["advanced"],
      "minLength": 1
    },
    // NOTE(review): the description says ">= 0" while "minimum" below is 1 — confirm which is intended.
    "maxDepth": {
      "type": "integer",
      "title": "Maximum tree depth",
      "description": "Maximum depth of the tree (>= 0).  E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.",
      "default": 5,
      "maximum": 20,
      "exclusiveMaximum": false,
      "minimum": 1,
      "exclusiveMinimum": false
    },
    // NOTE(review): the description says "Must be >=2" while "minimum" below is 0 — confirm which is intended.
    "maxBins": {
      "type": "integer",
      "title": "Maximum number of discretizing bins",
      "description": "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.",
      "default": 32,
      "maximum": 128,
      "exclusiveMaximum": false,
      "minimum": 0,
      "exclusiveMinimum": false
    },
    "numTrees": {
      "type": "integer",
      "title": "Number of trees",
      "description": "Number of trees to train (>= 1)",
      "default": 20,
      "maximum": 1000,
      "exclusiveMaximum": false,
      "minimum": 1,
      "exclusiveMinimum": false
    },
    // Single-value enum: the only accepted job type is "random_forests_classifier".
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["random_forests_classifier"],
      "default": "random_forests_classifier",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1,
  "unsafe": false,
  // Groupings of property names under display labels. SchemaParamFields in this file does
  // not read this key; presumably it is consumed by the product UI — verify before relying on it.
  "propertyGroups": [{
    "label": "Input/Output Parameters",
    "properties": ["trainingCollection", "outputCollection", "dataFormat", "trainingDataFilterQuery", "trainingDataFrameConfigOptions", "trainingDataSamplingFraction", "randomSeed"]
  }, {
    "label": "Field Parameters",
    "properties": ["fieldToVectorize", "sourceFields", "predictedLabelField", "trainingLabelField"]
  }, {
    "label": "Model Tuning Parameters",
    "properties": ["w2vDimension", "w2vWindowSize", "w2vMaxIter", "w2vMaxSentenceLength", "w2vStepSize", "withIdf", "maxDF", "minDF", "norm", "autoBalanceClasses", "evaluationMetricType", "minTrainingSamplesPerClass", "otherClassName", "makeOtherClass", "gridSearch", "maxBins", "numTrees", "maxDepth"]
  }, {
    "label": "Featurization Parameters",
    "properties": ["analyzerConfig"]
  }, {
    "label": "Misc. Parameters",
    "properties": ["modelId"]
  }]
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-ai/reference/jobs/random-forest-classifier-training

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-ai/reference/jobs/random-forest-classifier-training

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/579

Train a [random forest classifier](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.RandomForestClassificationModel.html) for text classification.

<LwTemplate />

<SchemaParamFields schema={schema} />
