> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Tag Part-of-Speech Index Stage

export const schema = {
  // Configuration schema for the Tag Part-of-Speech index stage.
  // Passed to <SchemaParamFields /> at the bottom of this page to render the Configuration section.
  type: "object",
  title: "Tag Part-of-Speech",
  description: "Tag sentences with part-of-speech information. Requires Sentence Detection on the same fields earlier in the pipeline",
  // Property names that must be present in a stage definition.
  required: ["tokenizerModel", "posModel", "source"],
  properties: {
    // Properties carrying the "advanced" hint.
    skip: {
      type: "boolean",
      title: "Skip This Stage",
      description: "Set to true to skip this stage.",
      default: false,
      hints: ["advanced"],
    },
    label: {
      type: "string",
      title: "Label",
      description: "A unique label for this stage.",
      hints: ["advanced"],
      maxLength: 255,
    },
    condition: {
      type: "string",
      title: "Condition",
      description: "Define a conditional script that must result in true or false. This can be used to determine if the stage should process or not.",
      hints: ["code", "code/javascript", "advanced"],
    },
    // Model references: both are declared as blob references of type "model:open-nlp".
    tokenizerModel: {
      type: "string",
      title: "Tokenizer Model",
      reference: "blob",
      blobType: "model:open-nlp",
    },
    posModel: {
      type: "string",
      title: "Part of Speech Model",
      reference: "blob",
      blobType: "model:open-nlp",
    },
    // At least one source field name is required (minItems: 1).
    source: {
      type: "array",
      title: "Source Fields",
      minItems: 1,
      items: {type: "string"},
    },
  },
  category: "Natural Language Processing",
  categoryPriority: 5,
  unsafe: false,
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-server/reference/pipeline-stages/indexing/part-of-speech-index-stage

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-server/reference/pipeline-stages/indexing/part-of-speech-index-stage

[old doc.lw link]: https://doc.lucidworks.com/fusion-server/4.2/228

The Tag Part-of-Speech Index stage (previously called the Part of Speech stage) operates over one or more fields in the Pipeline Document. It marks sentences with part-of-speech information as annotations, which can be used by downstream indexing stages. Therefore, this stage requires a Detect Sentences stage to be defined over these fields earlier in the pipeline.

This stage uses the [Apache OpenNLP](http://opennlp.apache.org/) project’s [Part of Speech Tagger](http://opennlp.apache.org/docs/#tools.postagger) to mark tokens with their corresponding word type based on the token itself and the context of the token. The OpenNLP documentation states:

"A token might have multiple pos tags depending on the token and the context. The OpenNLP POS Tagger uses a probability model to predict the correct pos tag out of the tag set. To limit the possible tags for a token a tag dictionary can be used which increases the tagging and runtime performance of the tagger."

Models are available from the OpenNLP [models SourceForge repository](http://opennlp.sourceforge.net/models-1.5/). Model files must be uploaded to Fusion using the [Fusion Blob Store service](/docs/4/fusion-server/reference/api/blob-store-api) via the REST API.

<LwTemplate />

## Part-of-speech Tagging in a NLP Pipeline

The following video shows how to use a Part-of-speech indexing stage as part of an NLP pipeline:

<Frame>
  <iframe width="640" height="480" src="https://www.youtube.com/embed/bbbLAMnfHjY" title="Fusion Learners - OpenNLP and Gazetteer" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen />
</Frame>

## Stage Setup

Here is an example of how to upload a part-of-speech model file to the Fusion blob store:

**INPUT**

```bash wrap  theme={"dark"}
curl -u USERNAME:PASSWORD -X PUT --data-binary @en-pos-maxent.bin -H 'Content-type: text/plain' http://localhost:8764/api/blobs/en-pos-maxent.bin
```

**OUTPUT**

```json wrap  theme={"dark"}
{
  "name" : "en-pos-maxent.bin",
  "contentType" : "text/plain",
  "size" : 5696197,
  "modifiedTime" : "2015-07-15T06:57:48.636Z",
  "version" : 0,
  "md5" : "db2cd70395b9e2e4c6b9957015a10607"
}
```

This is an example setup of this stage using the previously loaded .bin file:

**INPUT**

```bash wrap  theme={"dark"}
curl -u USERNAME:PASSWORD -X POST -H 'Content-type: application/json' -d '{"id":"TagPartofSpeech1", "type": "tag-part-of-speech","tokenizerModel":"en-pos-maxent.bin","posModel":"en-pos-perceptron.bin","source": ["sample","text","for","NLP"]}' http://localhost:8764/api/index-stages/instances
```

**OUTPUT**

```json wrap  theme={"dark"}
{
  "type" : "tag-part-of-speech",
  "id" : "TagPartofSpeech1",
  "posModel" : "en-pos-perceptron.bin",
  "tokenizerModel" : "en-sent.bin",
  "source" : [ "sample", "text", "for", "NLP" ],
  "skip" : false,
  "label" : "tag-part-of-speech",
  "type" : "tag-part-of-speech"
}
```

## Configuration

<Tip>
  When entering configuration values in the UI, use *unescaped* characters, such as `\t` for the tab character. When entering configuration values in the API, use *escaped* characters, such as `\\t` for the tab character.
</Tip>

<SchemaParamFields schema={schema} />
