> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Apache Tika Parser Index Stage

export const schema = {
  // JSON-schema-style description of the Apache Tika Parser index stage.
  // It is passed to <SchemaParamFields schema={schema} /> at the bottom of this
  // page, which renders one parameter entry per key in "properties".
  // Keep comments inside the object: in MDX, a line placed before `export`
  // would be parsed as Markdown rather than JavaScript.
  "type": "object",
  "title": "Apache Tika Parser (deprecated)",
  "description": "A stage that uses Apache Tika for parsing rich document formats like PDF, Word, etc.",
  "properties": {
    "skip": {
      "type": "boolean",
      "title": "Skip This Stage",
      "description": "Set to true to skip this stage.",
      "default": false,
      "hints": ["advanced"]
    },
    "label": {
      "type": "string",
      "title": "Label",
      "description": "A unique label for this stage.",
      "hints": ["advanced"],
      "maxLength": 255
    },
    "condition": {
      "type": "string",
      "title": "Condition",
      "description": "Define a conditional script that must result in true or false. This can be used to determine if the stage should process or not.",
      "hints": ["code", "code/javascript", "advanced"]
    },
    "includeImages": {
      "type": "boolean",
      "title": "Include Images",
      "default": false
    },
    "flattenCompound": {
      "type": "boolean",
      "title": "Flatten Compound Documents",
      "default": false
    },
    "addFailedDocs": {
      "type": "boolean",
      "title": "Add Failed Documents",
      "default": false
    },
    "addOriginalContent": {
      "type": "boolean",
      "title": "Add Original Document Content (Raw Bytes)",
      "default": false
    },
    "contentField": {
      "type": "string",
      "title": "Field Name Where Content is Expected",
      "default": "_raw_content_"
    },
    "contentEncoding": {
      "type": "string",
      "title": "Content Transport Encoding of the Content (per RFC1341)",
      "enum": ["binary", "base64"],
      "default": "binary"
    },
    "returnXml": {
      "type": "boolean",
      "title": "Return Parsed Content as XML or HTML",
      "default": false
    },
    "keepOriginalStructure": {
      "type": "boolean",
      "title": "Return Original XML and HTML Instead of Tika XML Output",
      "default": false
    },
    "extractHtmlLinks": {
      "type": "boolean",
      "title": "Extract XHTML Links",
      "description": "Collect links explicitly declared in document structure (e.g. using HTML tags, bookmarks, etc)",
      "default": true
    },
    "extractOtherLinks": {
      "type": "boolean",
      "title": "Extract Other Links",
      "description": "Use regex-based heuristic extractor to collect likely links from plain text content in all fields.",
      "default": false
    },
    "includeContentTypes": {
      "type": "array",
      "title": "Content Types to Include",
      "description": "List of content types to parse",
      "items": {
        "type": "string"
      }
    },
    "excludeContentTypes": {
      "type": "array",
      "title": "Content Types to Exclude",
      "description": "List of content types to exclude from parsing",
      "items": {
        "type": "string"
      }
    },
    "zipBombCompressionRatio": {
      "type": "integer",
      "title": "Maximum input-to-output byte ratio",
      "description": "Maximum number of output bytes Fusion will generate per input byte. If you are indexing highly compressed files, you may increase this value to avoid triggering 'Zip Bomb' detection",
      "default": 200
    },
    "zipBombMaxDepth": {
      "type": "integer",
      "title": "Maximum nesting depth",
      "description": "Sets the maximum XML element nesting level. If you are indexing highly nested files, you may increase this value to avoid triggering 'Zip Bomb' detection",
      "default": 200
    },
    "zipBombMaxPackageEntryDepth": {
      "type": "integer",
      "title": "Maximum package entry depth",
      "description": "Sets the maximum package entry nesting level. If you are indexing highly nested files, you may increase this value to avoid triggering 'Zip Bomb' detection",
      "default": 20
    }
  },
  "category": "Deprecated",
  "categoryPriority": 1,
  "unsafe": false
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-server/reference/pipeline-stages/indexing/apache-tika-parser-index-stage

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-server/reference/pipeline-stages/indexing/apache-tika-parser-index-stage

[old doc.lw link]: https://doc.lucidworks.com/fusion-server/4.2/207

The Apache Tika Parser index stage type includes rules for parsing documents with [Apache Tika](http://tika.apache.org/).
Fusion uses Tika v1.13.
(Note that components of the Solr distribution included with Fusion contain their own Tika jar files; these are not used by Fusion.)

<LwTemplate />

## Compatibility Issues

Raw streams create new documents, which Tika then tries to parse again. For this reason, avoid using the SDK connector or any other client that streams to the index pipeline.

## Configuration

<Tip>
  When entering configuration values in the UI, use *unescaped* characters, such as `\t` for the tab character. When entering configuration values in the API, use *escaped* characters, such as `\\t` for the tab character.
</Tip>

<SchemaParamFields schema={schema} />