> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Gazetteer Lookup Extraction Index Stage

export const schema = {
  // JSON-schema-style description of the stage configuration; it is passed to
  // <SchemaParamFields> at the bottom of this page to render the parameter list.
  type: "object",
  title: "Gazetteer Lookup Extraction",
  description: "This stage allows you to extract entities using pre-defined gazetteers",
  properties: {
    // Settings tagged with the "advanced" hint.
    skip: {
      type: "boolean",
      title: "Skip This Stage",
      description: "Set to true to skip this stage.",
      default: false,
      hints: [
        "advanced"
      ]
    },
    label: {
      type: "string",
      title: "Label",
      description: "A unique label for this stage.",
      hints: [
        "advanced"
      ],
      maxLength: 255
    },
    condition: {
      type: "string",
      title: "Condition",
      description: "Define a conditional script that must result in true or false. This can be used to determine if the stage should process or not.",
      hints: [
        "code",
        "code/javascript",
        "advanced"
      ]
    },
    // Each rule maps one or more source fields to a single target field.
    rules: {
      type: "array",
      title: "Extraction Rules",
      items: {
        type: "object",
        required: [
          "source",
          "target",
          "entityTypes"
        ],
        properties: {
          source: {
            type: "array",
            title: "Source Fields",
            minItems: 1,
            items: {
              type: "string",
              format: "regex"
            }
          },
          target: {
            type: "string",
            title: "Target Field"
          },
          writeMode: {
            type: "string",
            title: "Write Mode",
            description: "What to do if document has target field already",
            enum: [
              "overwrite",
              "append"
            ],
            default: "append",
            hints: [
              "advanced"
            ]
          },
          // Entity types whose definitions are blob references.
          entityTypes: {
            type: "array",
            title: "Entity Types",
            minItems: 1,
            items: {
              type: "object",
              required: [
                "name",
                "entityDefinitions"
              ],
              properties: {
                name: {
                  type: "string",
                  title: "Entity Name"
                },
                entityDefinitions: {
                  type: "array",
                  title: "Entity Definitions (Model files)",
                  minItems: 1,
                  items: {
                    type: "string",
                    reference: "blob",
                    blobType: "model:open-nlp"
                  }
                }
              }
            }
          },
          // Entity types whose definitions are plain strings listed inline.
          additionalEntities: {
            type: "array",
            title: "Additional Entities",
            items: {
              type: "object",
              required: [
                "name",
                "entityDefinitions"
              ],
              properties: {
                name: {
                  type: "string",
                  title: "Entity Name"
                },
                entityDefinitions: {
                  type: "array",
                  title: "Entity Definitions (List of entities)",
                  minItems: 1,
                  items: {
                    type: "string"
                  }
                }
              }
            }
          },
          caseSensitive: {
            type: "boolean",
            title: "Case Sensitive",
            default: false
          }
        }
      }
    }
  },
  category: "Natural Language Processing",
  categoryPriority: 5,
  unsafe: false
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-server/reference/pipeline-stages/indexing/gazetteer-lookup-extractor-index-stage

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-server/reference/pipeline-stages/indexing/gazetteer-lookup-extractor-index-stage

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/220

The Gazetteer Lookup Extraction index stage (called the Gazetteer Lookup Extractor stage in versions earlier than 3.0) uses predefined lists of words and phrases to process specified text fields in a document.
A gazetteer is a set of lookup lists over names of people, places, or things.
These lookup lists are used to find occurrences of these names in text.
The matched items are saved into separate fields on the document for downstream processing.

<LwTemplate />

## Gazetteers and OpenNLP Tools

The following video shows how to configure a Gazetteer Lookup Extraction stage in combination with OpenNLP:

<Frame>
  <iframe width="640" height="480" src="https://www.youtube.com/embed/bbbLAMnfHjY" title="Fusion Learners - OpenNLP and Gazetteer" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen />
</Frame>

## Uploading Lookup Lists to Fusion Blob Store

Fusion includes a number of lookup lists in the directory https\://FUSION\_HOST:FUSION\_PORT/data/nlp/gazetteer.
To use the supplied lists or a list of your own data, each list must be uploaded to Fusion
using the [Blob Store API](/docs/4/fusion-server/reference/api/blob-store-api)
in order to make the list contents available to the Gazetteer Lookup Extraction stage.

For example, to identify color names, you would first compile a list of color terms, one entry per line in a text file with suffix `.lst` and then
upload that file using the Fusion REST API endpoint `api/blobs/<listfilename>`, as per the following example which
uses the `curl` command-line utility, where `USERNAME` is the name of a user with admin privileges, and `PASSWORD` is that user’s password:

```bash wrap  theme={"dark"}
curl -u USERNAME:PASSWORD -X PUT --data-binary @data/nlp/gazetteer/colors.lst -H 'Content-type: text/plain' http://localhost:8764/api/blobs/colors.lst
```

## Name Lookup Example

*Define a lookup-extractor to identify mentions of certain celebrities in text field `description_t`:*

```json wrap  theme={"dark"}
{
  "type" : "lookup-extractor",
  "id" : "peopleLookup",
  "rules" : [ {
    "source" : [ "description_t" ],
    "target" : "celebrities_ss",
    "entityTypes" : [ {
      "name" : "person_female",
      "definitions" : [ "person_female.lst" ]
    } ],
    "additionalEntities" : [ {
      "name" : "players",
      "definitions" : [ "sharapova", "murray" ]
    }, {
      "name" : "actors",
      "definitions" : [ "pitt", "jolie" ]
    } ],
    "caseSensitive" : false
  } ],
  "skip" : false
}
```

## Configuration

<Tip>
  When entering configuration values in the UI, use *unescaped* characters, such as `\t` for the tab character. When entering configuration values in the API, use *escaped* characters, such as `\\t` for the tab character.
</Tip>

<SchemaParamFields schema={schema} />
