> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# OpenNLP NER Extraction Index Stage

export const schema = {
  // Stage-level metadata; this object is handed to SchemaParamFields at the bottom of the page.
  type: "object",
  title: "OpenNLP NER Extraction",
  description: "This stage allows you to extract named entities using natural language processing models",
  required: ["rules"],
  properties: {
    // Settings declared directly on the stage.
    skip: {
      type: "boolean",
      title: "Skip This Stage",
      description: "Set to true to skip this stage.",
      default: false,
      hints: ["advanced"]
    },
    label: {
      type: "string",
      title: "Label",
      description: "A unique label for this stage.",
      hints: ["advanced"],
      maxLength: 255
    },
    condition: {
      type: "string",
      title: "Condition",
      description: "Define a conditional script that must result in true or false. This can be used to determine if the stage should process or not.",
      hints: ["code", "code/javascript", "advanced"]
    },
    // At least one extractor rule is required; each rule names its source fields, target field, and model blobs.
    rules: {
      type: "array",
      title: "Extractor Rules",
      minItems: 1,
      items: {
        type: "object",
        required: ["source", "target", "sentenceModelLocation", "tokenizerModelLocation"],
        properties: {
          source: {
            type: "array",
            title: "Source Fields",
            minItems: 1,
            items: {type: "string", format: "regex"}
          },
          target: {type: "string", title: "Target Field"},
          writeMode: {
            type: "string",
            title: "Write Mode",
            description: "What to do if document has target field already",
            enum: ["overwrite", "append"],
            default: "append",
            hints: ["advanced"]
          },
          sentenceModelLocation: {
            type: "string",
            title: "Sentence Model",
            reference: "blob",
            blobType: "model:open-nlp"
          },
          tokenizerModelLocation: {
            type: "string",
            title: "Tokenizer Model",
            reference: "blob",
            blobType: "model:open-nlp"
          },
          // Each entity type pairs a name with a model blob reference.
          entityTypes: {
            type: "array",
            title: "Entity Types",
            items: {
              type: "object",
              required: ["name", "definition"],
              properties: {
                name: {type: "string", title: "Entity Name"},
                definition: {
                  type: "string",
                  title: "Entity Definition",
                  reference: "blob",
                  blobType: "model:open-nlp"
                }
              }
            }
          }
        }
      }
    }
  },
  category: "Natural Language Processing",
  categoryPriority: 5,
  unsafe: false
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-ai/reference/index-pipeline-stages/opennlp-ner-extractor-index-stage

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-ai/reference/index-pipeline-stages/opennlp-ner-extractor-index-stage

[old doc.lw link]: https://doc.lucidworks.com/fusion-ai/4.2/542

Named Entity Recognition (NER) is the task of finding the names of persons, organizations, locations, and/or things in a passage of free text. The OpenNLP NER Extraction index stage (previously called the OpenNLP NER Extractor stage) uses a set of rules to find named entities in a field in the Pipeline Document (the "source") and populates a new field (the "target") with these entities.

This stage uses the [Apache OpenNLP](http://opennlp.apache.org/) project’s [Named Entity Recognition tool](http://opennlp.apache.org/docs/#tools.namefind.recognition) (the Name Finder tool). The OpenNLP documentation states:

> The Name Finder tool can detect named entities and numbers in text. To be able to detect entities the Name Finder needs a model. The model is dependent on the language and entity type it was trained for. The OpenNLP projects offers a number of pre-trained name finder models which are trained on various freely available corpora. They can be downloaded at our model download page. To find names in raw text the text must be segmented into tokens and sentences.

Fusion 4.x.x contains a common set of NER models for English that include sentence, token, and part-of-speech models. These models are:

| Model                                | Purpose                                       |
| ------------------------------------ | --------------------------------------------- |
| `nlp/models/en-sent.bin`             | Sentence model to detect sentences            |
| `nlp/models/en-token.bin`            | Tokenizer model for tokenization of sentences |
| `nlp/models/en-ner-date.bin`         | Date name finder model                        |
| `nlp/models/en-ner-location.bin`     | Location name finder model                    |
| `nlp/models/en-ner-money.bin`        | Money name finder model                       |
| `nlp/models/en-ner-organization.bin` | Organization name finder model                |
| `nlp/models/en-ner-percentage.bin`   | Percentage name finder model                  |
| `nlp/models/en-ner-person.bin`       | Person name finder model                      |
| `nlp/models/en-ner-time.bin`         | Time name finder model                        |

<Note>
  See [OpenNLP 1.5 series](http://opennlp.sourceforge.net/models-1.5/) for additional pre-trained OpenNLP models.
</Note>

To use these models, upload them to Fusion using the [Fusion Blob Store service](/docs/4/fusion-server/concepts/indexing/blob-storage). Here is an example of how to upload the sentence model file using the `curl` command-line utility, where `USERNAME` is the name of a user with admin privileges and `PASSWORD` is that user's password:

```bash wrap  theme={"dark"}
curl -u USERNAME:PASSWORD -X PUT --data-binary @data/nlp/models/en-sent.bin -H 'Content-type: application/octet-stream' http://localhost:8764/api/blobs/en-sent.bin
```

See [Natural Language Processing](/docs/4/fusion-ai/concepts/nlp) for more information.

<LwTemplate />

## Example Specification

*Specification of a stage that extracts names of organizations and people from a field named `body_t`:*

```json wrap  theme={"dark"}
{
   "type":"nlp-extractor",
   "id":"iqtr",
   "rules":[
      {
         "source":[
            "body_t"
         ],
         "target":"organizations",
         "writeMode":"append",
         "sentenceModelLocation":"nlp/models/en-sent.bin",
         "tokenizerModelLocation":"nlp/models/en-token.bin",
         "entityTypes":[
            {
               "name":"organization",
               "definition":"nlp/models/en-ner-organization.bin"
            }
         ]
      },
      {
         "source":[
            "body_t"
         ],
         "target":"persons",
         "writeMode":"append",
         "sentenceModelLocation":"nlp/models/en-sent.bin",
         "tokenizerModelLocation":"nlp/models/en-token.bin",
         "entityTypes":[
            {
               "name":"person",
               "definition":"nlp/models/en-ner-person.bin"
            }
         ]
      }
   ],
   "skip":false,
   "label":"Extract Entities",
   "licensed":true,
   "secretSourceStageId":"iqtr"
}
```

## Configuration

<Tip>
  When entering configuration values in the UI, use *unescaped* characters, such as `\t` for the tab character. When entering configuration values in the API, use *escaped* characters, such as `\\t` for the tab character.
</Tip>

<SchemaParamFields schema={schema} />
