> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Collection Analysis Jobs

export const schema = {
  "type": "object",
  "title": "Collection Analysis",
  "description": "Use this job when you want to compute basic metrics about your collection such as average word length, phrase percentages, and outlier documents (with large or few number of terms)",
  "required": ["id", "trainingCollection", "fieldToVectorize", "numDeviations", "type"],
  "properties": {
    "id": {
      "type": "string",
      "title": "Spark Job ID",
      "description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
      "maxLength": 128,
      "pattern": "^[A-Za-z0-9_\\-]+$"
    },
    "trainingCollection": {
      "type": "string",
      "title": "Training Collection",
      "description": "Collection you want to analyze.",
      "minLength": 1
    },
    "fieldToVectorize": {
      "type": "string",
      "title": "Field to Vectorize",
      "description": "Field you want to analyze the lengths of. For example: description fields.",
      "minLength": 1
    },
    "dataFormat": {
      "type": "string",
      "title": "Data format",
      "description": "Spark-compatible format which training data comes in (like 'solr', 'hdfs', 'file', 'parquet' etc)",
      "enum": ["solr", "hdfs", "file", "parquet"],
      "default": "solr",
      "hints": ["advanced"]
    },
    "trainingDataFrameConfigOptions": {
      "type": "object",
      "title": "Dataframe Config Options",
      "description": "Additional spark dataframe loading configuration options",
      "properties": {},
      "additionalProperties": {
        "type": "string"
      },
      "hints": ["advanced"]
    },
    "trainingDataFilterQuery": {
      "type": "string",
      "title": "Training data filter query",
      "description": "Solr query to use when loading training data",
      "default": "*:*",
      "hints": ["advanced"],
      "minLength": 3
    },
    "trainingDataSamplingFraction": {
      "type": "number",
      "title": "Training data sampling fraction",
      "description": "Fraction of the training data to use",
      "default": 1,
      "hints": ["advanced"],
      "maximum": 1,
      "exclusiveMaximum": false
    },
    "randomSeed": {
      "type": "integer",
      "title": "Random seed",
      "description": "For any deterministic pseudorandom number generation",
      "default": 1234,
      "hints": ["advanced"]
    },
    "outputCollection": {
      "type": "string",
      "title": "Output Collection",
      "description": "Solr Collection to store model-labeled data to"
    },
    "overwriteOutput": {
      "type": "boolean",
      "title": "Overwrite Output",
      "description": "Overwrite output collection",
      "default": true,
      "hints": ["hidden", "advanced"]
    },
    "sourceFields": {
      "type": "string",
      "title": "Fields to Load",
      "description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
      "hints": ["advanced"]
    },
    "numDeviations": {
      "type": "integer",
      "title": "Number of standard deviations for obtaining outliers",
      "description": "Number of standard deviations away from the mean we deem acceptable for obtaining outlier from this collection. If you want all the documents, set this to a larger number.",
      "minimum": 0,
      "exclusiveMinimum": false
    },
    "dateField": {
      "type": "string",
      "title": "Date Field",
      "description": "The field that corresponds to the date field you will be using"
    },
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["collection_analysis"],
      "default": "collection_analysis",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1,
  "unsafe": false,
  "propertyGroups": [{
    "label": "Input/Output Parameters",
    "properties": ["trainingCollection", "outputCollection", "dataFormat", "trainingDataFilterQuery", "trainingDataFrameConfigOptions", "trainingDataSamplingFraction", "randomSeed"]
  }, {
    "label": "Field Parameters",
    "properties": ["fieldToVectorize", "sourceFields", "dateField"]
  }, {
    "label": "Model Tuning Parameters",
    "properties": ["numDeviations"]
  }]
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-ai/reference/jobs/collection-analysis

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-ai/reference/jobs/collection-analysis

[old doc.lw link]: https://doc.lucidworks.com/fusion-ai/4.2/581

Use this job when you want to compute basic metrics about your collection, like average word length, phrase percentages, and outlier documents (with very many or very few documents).

<LwTemplate />

<SchemaParamFields schema={schema} />
