> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Synonym and Similar Queries Detection Jobs

export const schema = {
  // JSON-Schema-style description of this job's configuration.
  // It is passed to <SchemaParamFields schema={schema} /> at the bottom of this
  // page, which reads only `description`, `properties` and `required`.
  "type": "object",
  "title": "Synonym and Similar Queries Detection",
  "description": "Use this job to generate synonym and similar query pairs.",
  "required": ["id", "trainingCollection", "fieldToVectorize", "countField", "docIdField", "type"],
  // Properties whose `hints` array contains "hidden" are skipped by SchemaParamFields.
  "properties": {
    "id": {
      "type": "string",
      "title": "Spark Job ID",
      // NOTE(review): `pattern` below also accepts digits 0-9, which this
      // description does not mention — confirm which one is authoritative.
      "description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_)",
      "maxLength": 128,
      "pattern": "^[A-Za-z0-9_\\-]+$"
    },
    "trainingCollection": {
      "type": "string",
      "title": "Input Collection",
      "description": "Collection containing queries, document id and event counts. Can be either signal aggregation collection or raw signals collection.",
      "minLength": 1
    },
    "fieldToVectorize": {
      "type": "string",
      "title": "Query Field Name",
      "description": "Field containing queries. Change to query_s to use aggregation collection",
      "default": "query",
      "minLength": 1
    },
    "dataFormat": {
      "type": "string",
      "title": "Data format",
      "description": "Spark-compatible format which training data comes in (like 'solr', 'hdfs', 'file', 'parquet' etc)",
      "enum": ["solr", "hdfs", "file", "parquet"],
      "default": "solr",
      "hints": ["advanced"]
    },
    "trainingDataFrameConfigOptions": {
      "type": "object",
      "title": "Dataframe Config Options",
      "description": "Additional spark dataframe loading configuration options",
      // Free-form string map: no fixed keys are declared, only additionalProperties.
      "properties": {},
      "additionalProperties": {
        "type": "string"
      },
      "hints": ["advanced"]
    },
    "trainingDataFilterQuery": {
      "type": "string",
      "title": "Data filter query",
      "description": "Solr query to additionally filter the input collection.",
      "default": "*:*",
      "hints": ["dummy"],
      "minLength": 3
    },
    "trainingDataSamplingFraction": {
      "type": "number",
      "title": "Training data sampling fraction",
      "description": "Fraction of the training data to use",
      "default": 1,
      "hints": ["advanced"],
      "maximum": 1,
      "exclusiveMaximum": false
    },
    "randomSeed": {
      "type": "integer",
      "title": "Random seed",
      "description": "For any deterministic pseudorandom number generation",
      "default": 1234,
      "hints": ["advanced"]
    },
    "outputCollection": {
      "type": "string",
      "title": "Output Collection",
      "description": "Collection to store synonym and similar query pairs.",
      "hints": ["dummy"]
    },
    "overwriteOutput": {
      "type": "boolean",
      "title": "Overwrite Output",
      "description": "Overwrite output collection",
      "default": true,
      "hints": ["hidden", "advanced"]
    },
    "sourceFields": {
      "type": "string",
      "title": "Fields to Load",
      "description": "Solr fields to load (comma-delimited). Leave empty to allow the job to select the required fields to load at runtime.",
      "hints": ["hidden"]
    },
    "misspellingCollection": {
      "type": "string",
      "title": "Misspelling Job Result Collection",
      "description": "Solr collection containing reviewed result of Token and phrase spell correction job. Defaults to the query_rewrite_staging collection for the app."
    },
    "misspellingsFilterQuery": {
      "type": "string",
      "title": "Misspelling Job Result Filter Query",
      "description": "Solr query to additionally filter the misspelling results. Defaults to reading all approved spell corrections.",
      "default": "type:spell"
    },
    "keyPhraseCollection": {
      "type": "string",
      "title": "Phrase Extraction Job Result Collection",
      "description": "Solr collection containing reviewed result of Phrase extraction job. Defaults to the query_rewrite_staging collection for the app."
    },
    "keyPhraseFilterQuery": {
      "type": "string",
      "title": "Phrase Extraction Job Result Filter Query",
      "description": "Solr query to additionally filter the phrase extraction results. Defaults to reading all approved phrases.",
      "default": "type:phrase"
    },
    "countField": {
      "type": "string",
      "title": "Event Count Field Name",
      "description": "Solr field containing number of events (e.g., number of clicks). Change to aggr_count_i to use aggregated signals",
      "default": "count_i"
    },
    "docIdField": {
      "type": "string",
      "title": "Document id Field Name",
      "description": "Solr field containing document id that user clicked. Change to doc_id_s for aggregation collection",
      "default": "doc_id"
    },
    "overlapThreshold": {
      "type": "number",
      "title": "Query Similarity Threshold",
      // NOTE(review): "are consider similar" in this description (and in
      // similarityThreshold below) is a typo for "are considered similar";
      // it is rendered verbatim on the page.
      "description": "The threshold above which query pairs are consider similar. We can get more synonym pairs if increase this value but quality may get reduced.",
      "default": 0.5,
      "hints": ["advanced"]
    },
    "similarityThreshold": {
      "type": "number",
      "title": "Synonym Similarity Threshold",
      "description": "The threshold above which synonym pairs are consider similar. We can get more synonym pairs if increase this value but quality may get reduced.",
      "default": 0.01,
      "hints": ["advanced"]
    },
    "minQueryCount": {
      "type": "integer",
      "title": "Query Clicks Threshold",
      "description": "The min number of clicked documents needed for comparing queries.",
      "default": 5,
      "hints": ["advanced"]
    },
    "keywordsBlobName": {
      "type": "string",
      "title": "Keywords Blob Store",
      "description": "Name of the keywords blob resource. Typically, this should be a csv file uploaded to blob store in a specific format. Check documentation for more details on format and uploading to blob store.",
      "reference": "blob",
      "blobType": "file:spark"
    },
    "analyzerConfigQuery": {
      "type": "string",
      "title": "Lucene Analyzer Schema",
      "description": "LuceneTextAnalyzer schema for tokenizing queries (JSON-encoded)",
      // The default is a JSON document stored as an escaped string; because it is
      // longer than 20 characters SchemaParamFields shows it in a code block.
      "default": "{ \"analyzers\": [ { \"name\": \"LetterTokLowerStem\",\"charFilters\": [ { \"type\": \"htmlstrip\" } ],\"tokenizer\": { \"type\": \"letter\" },\"filters\": [{ \"type\": \"lowercase\" },{ \"type\": \"length\", \"min\": \"2\", \"max\": \"32767\" },{ \"type\": \"KStem\" }] }],\"fields\": [{ \"regex\": \".+\", \"analyzer\": \"LetterTokLowerStem\" } ]}",
      "hints": ["lengthy", "advanced", "code/json"],
      "minLength": 1
    },
    "stopwordsList": {
      "type": "array",
      "title": "List of stopwords",
      "description": "Stopwords defined in Lucene analyzer config",
      "hints": ["readonly", "hidden"],
      "items": {
        "type": "string",
        "minLength": 1,
        "reference": "blob",
        "blobType": "file:spark"
      }
    },
    "enableAutoPublish": {
      "type": "boolean",
      "title": "Enable auto-publishing",
      "description": "If true, automatically publishes rewrites for rules. Default is false to allow for initial human-aided reviewing",
      "default": false,
      "hints": ["advanced"]
    },
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["synonymDetection"],
      "default": "synonymDetection",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1,
  "unsafe": false,
  // NOTE(review): the groups below do not list every property (for example
  // docIdField, similarityThreshold and minQueryCount are absent), and the
  // SchemaParamFields component on this page does not read propertyGroups.
  "propertyGroups": [{
    "label": "Input/Output Parameters",
    "properties": ["trainingCollection", "outputCollection", "dataFormat", "trainingDataFilterQuery", "trainingDataFrameConfigOptions", "trainingDataSamplingFraction", "randomSeed"]
  }, {
    "label": "Field Parameters",
    "properties": ["fieldToVectorize", "sourceFields", "countField"]
  }, {
    "label": "Model Tuning Parameters",
    "properties": ["overlapThreshold"]
  }, {
    "label": "Featurization Parameters",
    "properties": ["analyzerConfigQuery"]
  }, {
    "label": "Misc. Parameters",
    "properties": ["keywordsBlobName"]
  }]
};

export const SchemaParamFields = ({schema}) => {
  // Renders a JSON-Schema-like object as a list of <ParamField> entries.
  //
  // Props:
  //   schema.description - optional text rendered above the list.
  //   schema.properties  - map of property name -> property schema.
  //   schema.required    - optional array of required property names.
  //
  // Properties whose `hints` include "hidden" are skipped. Object-typed
  // properties that declare nested properties are rendered recursively
  // inside an <Expandable>.

  // Strips one pair of wrapping double quotes, drops backslashes and turns
  // remaining double quotes into single quotes. Non-strings pass through.
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };

  // Sanitizes a description and guarantees terminal punctuation; a trailing
  // run of ")" after the punctuation is tolerated.
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };

  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(
    () => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")),
    [properties]
  );

  return (
    <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
        const isRequired = requiredProps.includes(name);
        const rawDefault = prop.default;
        const hasDefault = rawDefault !== undefined;
        // Objects, long strings (> 20 chars) and strings containing a double
        // quote are shown in a code block instead of the inline `default` slot.
        const isComplexDefault = hasDefault && (
          typeof rawDefault === "object" ||
          (typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')))
        );

        // `key` is intentionally NOT part of this object: React requires it to
        // be passed directly to the element rather than through a spread.
        const fieldProps = {
          body: prop.title || name,
          type: prop.type,
          ...(prop.title && {
            post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
          }),
          ...(isRequired && {required: true}),
          ...(hasDefault && !isComplexDefault ? {default: sanitize(String(rawDefault))} : {})
        };

        // Only offer the nested "properties" expandable when there is something
        // to show; a free-form map declared with `properties: {}` would
        // otherwise render an empty expandable.
        const hasNestedProps = prop.type === "object" && prop.properties && Object.keys(prop.properties).length > 0;
        const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;

        return (
          <ParamField key={name} {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && (
              <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>
            )}

            {isArrayOfObjects && (
              <div className="flex">
                <p>
                  <strong>Object attributes:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {'{\n'}
                    {/* A flat array of strings plus a keyed <span> keeps the same
                        DOM as before without an un-keyed fragment per item. */}
                    {Object.entries(prop.items.properties).flatMap(([iname, iprop]) => [
                      `  ${iname}`,
                      prop.items.required?.includes(iname) && (
                        <span key={`${iname}-required`} style={{color: 'red'}}> required</span>
                      ),
                      `: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`
                    ])}
                    {'}'}
                  </code>
                </pre>
              </div>
            )}

            {hasNestedProps && (
              <Expandable title="properties">
                <SchemaParamFields schema={{
                  properties: prop.properties,
                  required: prop.required
                }} />
              </Expandable>
            )}
          </ParamField>
        );
      })}
    </div>
  );
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Lucidworks Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  // Card that embeds the <lw-template> custom element plus a call-to-action link.
  //
  // Props:
  //   title    - card heading.
  //   icon     - card icon name.
  //   cta      - text of the call-to-action link. Previously this prop was
  //              accepted but ignored (the link text was hard-coded); the
  //              default now equals that hard-coded text so pages that do not
  //              pass `cta` render exactly as before.
  //   linkHref - target of the call-to-action link.
  const [isLoaded, setIsLoaded] = useState(false);

  // The custom element is only injected 500 ms after mount.
  // NOTE(review): presumably this gives the script that defines <lw-template>
  // time to load — confirm before changing the delay.
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);

  return (
    <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {/* The markup is a fixed literal (no user input), injected as raw HTML
            so the custom element is created by the browser's HTML parser. */}
        {isLoaded && <span dangerouslySetInnerHTML={{
          __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
        }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">{cta}</Link>
      </Card>
    </div>
  );
};

[localhost link]: http://localhost:3000/docs/4/fusion-ai/reference/jobs/synonym-and-similar-queries-detection

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-ai/reference/jobs/synonym-and-similar-queries-detection

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/563

Use this job to generate pairs of synonyms and pairs of similar queries. Two words are considered potential synonyms when they are used in a similar context in similar queries.

<Note>
  For best job speed and to avoid memory issues, use [aggregated signals](/docs/4/fusion-ai/concepts/signals-and-aggregations/aggregations/overview) instead of raw signals as input for this job.
</Note>

You can review, edit, deploy, or delete output from this job using the [Query Rewriting UI](/docs/4/fusion-ai/concepts/query-rewriting/overview).

**Output from the [Token and Phrase Spell Correction job](/docs/4/fusion-ai/reference/jobs/token-and-phrase-spell-correction) and the [Phrase Extraction job](/docs/4/fusion-ai/reference/jobs/phrase-extraction) can be used as input for this job.**

<LwTemplate />

## Input

This job takes one or more of the following as input:

* [signals](#signal-data) (required)
* [misspelling job results](#misspelling-job-results)
* [phrase detection job results](#phrase-detection-job-results)
* [keywords](#keywords)
* [synonyms](#custom-synonyms)

### Signal data

This input is required; additional input is optional. [Signal data](/docs/4/fusion-ai/concepts/signals-and-aggregations/signals/overview) can be either raw or aggregated. The job runs faster using [aggregated signals](/docs/4/fusion-ai/concepts/signals-and-aggregations/aggregations/overview). When raw signals are used as input, this job performs the aggregation.

Use the `trainingCollection`/**Input Collection** parameter to specify the collection that contains the signal data.

### Misspelling job results

[Token and Phrase Spell Correction job](/docs/4/fusion-ai/reference/jobs/token-and-phrase-spell-correction) results can be used to avoid finding mainly misspellings, or mixing synonyms with misspellings.

Use the `misspellingCollection`/**Misspelling Job Result Collection** parameter to specify the collection that contains these results.

### Phrase detection job results

[Phrase Extraction job](/docs/4/fusion-ai/reference/jobs/phrase-extraction) results can be used to find synonyms with multiple tokens, such as "lithium ion" and "ion battery".

Use the `keyPhraseCollection`/**Phrase Extraction Job Result Collection** parameter to specify the collection that contains these results.

### Keywords

A keywords list in the [blob store](/docs/4/fusion-server/concepts/indexing/blob-storage) can serve as a blacklist to prevent common attributes from being identified as potential synonyms.

The list can include common attributes such as color, brand, material, and so on. For example, by including color attributes you can prevent "red" and "blue" from being identified as synonyms due to their appearance in similar queries such as "red bike" and "blue bike".

The keywords file is in CSV format with two fields: `keyword` and `type`. You can add your custom keywords list here with the `type` value "stopword". An example file is shown below:

```
keyword,type
cu,stopword
ft,stopword
mil,stopword
watt,stopword
wat,stopword
foot,stopword
feet,stopword
gal,stopword
unit,stopword
lb,stopword
wt,stopword
cc,stopword
cm,stopword
kg,stopword
km,stopword
oz,stopword
nm,stopword
qt,stopword
sale,stopword
on sale,stopword
for sale,stopword
clearance,stopword
gb,stopword
gig,stopword
color,stopword
blue,stopword
white,stopword
black,stopword
ivory,stopword
grey,stopword
brown,stopword
silver,stopword
light blue,stopword
light ivory,stopword
light grey,stopword
light brown,stopword
light silver,stopword
light green,stopword
```

Use the `keywordsBlobName`/**Keywords Blob Store** parameter to specify the name of the blob that contains this list.

### Custom Synonyms

For some deployments there might be a need to use existing synonym definitions. You can import existing synonyms into the Synonym and Similar Queries Detection job as a text file. Upload your synonyms text file to the blob store and reference that file when creating the job.

## Output

The output collection contains two tables distinguished by the `doc_type` field.

### The similar queries table

If `query` leads to clicks on documents 1, 2, 3, and 4, and `similar_query` leads to clicks on documents 2, 3, 4, and 5, then there is sufficient overlap between the two queries to consider them similar.

A statistic is constructed to compute similarities based on overlap counts and query counts. The resulting table consists of documents whose `doc_type` value is "query\_rewrite" and `type` value is "simq".

The similar queries table contains similar query pairs with these fields:

|                       |                                                                                                                                                                                                                                                              |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `query`               | The first half of the two-query pair.                                                                                                                                                                                                                        |
| `similar_query`       | The second half of the two-query pair.                                                                                                                                                                                                                       |
| `similarity`          | A score between 0 and 1 indicating how similar the two queries are.  All `similarity` values are greater than or equal to the configured Query Similarity Threshold to ensure that only high-similarity queries are kept and used as input to find synonyms. |
| `query_count`         | The number of clicks received by the `query` query.  To save computation time, only queries with at least as many clicks as the configured Query Clicks Threshold parameter are kept and used as input to find synonyms.                                     |
| `similar_query_count` | The number of clicks received by the `similar_query` query.                                                                                                                                                                                                  |

### The synonyms table

The synonyms table consists of documents whose `doc_type` value is "query\_rewrite" and `type` value is "synonym":

|                |                                                                                                                                                                                 |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `surface_form` | The first half of the two-synonym pair.                                                                                                                                         |
| `synonym`      | The second half of the two-synonym pair.                                                                                                                                        |
| `context`      | If there are more than two words or phrases with the same meaning, such as "macbook, apple mac, mac", then this field shows the group to which this pair belongs.               |
| `similarity`   | A similarity score to measure confidence.                                                                                                                                       |
| `count`        | The number of different contexts in which this synonym pair appears. <Tip> The bigger the number, the higher the quality of the pair. </Tip>                                    |
| `suggestion`   | The algorithm automatically selects `context`, synonym words or phrases, or the `synonym_group`, and puts it in this field. <Tip> Use this field as the field to review. </Tip> |
| `category`     | Whether the synonym is actually a misspelling.                                                                                                                                  |

<SchemaParamFields schema={schema} />
