> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# XML Transformation Index Stage

export const schema = {
  // JSON-schema-style description of the XML Transformation index stage.
  // It is handed to <SchemaParamFields schema={schema} /> at the bottom of this page, which
  // renders every entry under "properties" as a parameter field and flags the names listed
  // in "required". The "description" strings are shown to readers verbatim (after light
  // sanitizing), so they must read as finished sentences.
  "type": "object",
  "title": "XML Transformation",
  "description": "This stage transforms XML contained in a given field on the input document to a new pipeline document with the extracted fields that match the XPath query.  Both the field and the value mapping rules may contain XPath expressions. If both the field and the value contain more than one XPath expression, the stage assumes they are paired.",
  "required": ["rootXPath"],
  "properties": {
    "skip": {
      "type": "boolean",
      "title": "Skip This Stage",
      "description": "Set to true to skip this stage.",
      "default": false,
      "hints": ["advanced"]
    },
    "label": {
      "type": "string",
      "title": "Label",
      "description": "A unique label for this stage.",
      "hints": ["advanced"],
      "maxLength": 255
    },
    "condition": {
      "type": "string",
      "title": "Condition",
      "description": "Define a conditional script that must result in true or false. This can be used to determine if the stage should process or not.",
      "hints": ["code", "code/javascript", "advanced"]
    },
    "rootXPath": {
      "type": "string",
      "title": "Root XPath",
      "description": "All relative XPath mappings will be made relative to this path."
    },
    "splitOnRoot": {
      "type": "boolean",
      "title": "New Document per Root XPath Match?",
      "description": "If true and there is more than one match for the root XPath, then each root match will create a new document.  Defaults to true for backwards compatibility reasons.",
      "default": true
    },
    "parentIdField": {
      "type": "string",
      "title": "Parent ID Field Name",
      "description": "Add the parent document's ID onto the new document under this field name."
    },
    "bodyField": {
      "type": "string",
      "title": "Body Field Name",
      "description": "The field containing the XML document to process.",
      "default": "body"
    },
    "outputXMLFragments": {
      "type": "boolean",
      "title": "Output XML Fragments as Strings",
      "description": "If true, then XPath matches that result in a node selection will output the whole node as a String.  If false, just the text content of the node will be output.",
      "default": false
    },
    "mappings": {
      "type": "array",
      "title": "XPath Mappings",
      "description": "The XPath rules to apply to extract content from the designated Body field.  Extractions are added on to the document.",
      "items": {
        "type": "object",
        "required": ["xpath", "field"],
        "properties": {
          "xpath": {
            "type": "string",
            "title": "Value Expression",
            "description": "The expression for the value of the field.  May be a literal or an XPath.  If it is an XPath expression, it may be relative or absolute.  If relative, it is relative to the root XPath.  If absolute, see the documentation for the rules on how it is processed."
          },
          "field": {
            "type": "string",
            "title": "Field Expression",
            "description": "The name of the field.  May be an XPath entry or a literal.  If it is XPath, then the value that is resolved will be the name of one or more fields. If it is an XPath expression, it may be relative or absolute.  If relative, it is relative to the root XPath.  If absolute, see the documentation for the rules on how it is processed."
          },
          "multivalue": {
            "type": "boolean",
            "title": "Multi Value",
            // The parenthetical is closed here; it was previously left open in the rendered text.
            "description": "If false and there is only one field output: use the first match of the Value Expression as the value entry.  If false and there is more than one field match, then the stage will attempt to match up pairs of fields and values (AKA it acts like a 'zip' function).  If true, then each field will have all matching values.",
            "default": false
          }
        }
      }
    },
    "metadata": {
      "type": "array",
      "title": "Additional Metadata",
      "description": "Pass in any additional key/value pairs to be added to the document.",
      "items": {
        "type": "object",
        "required": ["field", "value"],
        "properties": {
          "field": {
            "type": "string",
            "title": "Field",
            "description": "The name of the field.  May be an XPath expression.  If it is an XPath expression and the Value is a literal/static value, then the literal value will be used for all fields generated by this mapping."
          },
          "value": {
            "type": "string",
            "title": "Value",
            "description": "The value associated with the field.  May be an XPath expression or a literal string.  Note, literal strings have precedence over XPath expressions.  Practically speaking, this means a bare token (e.g. 'foo') will be treated as a literal and will not match any 'foo' tag XPath expressions.  If you want the latter, use '//foo'."
          }
        }
      }
    },
    "keepParent": {
      "type": "boolean",
      "title": "Keep Parent Document",
      "description": "If true, keep the parent document.  If false, the content extracted from the body field will create one or more new documents.",
      "default": false
    }
  },
  "category": "Document Transformation",
  "categoryPriority": 8,
  "unsafe": false
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/4/fusion-server/reference/pipeline-stages/indexing/xml-transform-index-stage

[mintlify link]: https://doc.lucidworks.com/docs/4/fusion-server/reference/pipeline-stages/indexing/xml-transform-index-stage

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/245

The XML Transformation stage (previously called the XML Transform Stage) allows you to process an XML document into one or more Solr documents
and to specify mappings between elements and document fields.
A common use case for an XML Transformation stage in a pipeline is when the XML document
is a container-like document which contains a set of inner elements, each of which should be
treated as a separate document.
A parent ID field can be used to relate these multiple documents back to the containing document.

<LwTemplate />

## Pipeline Configuration

The default XML processing provided by the Apache Tika Parser index stage extracts all text from
an XML document into a single document field called `content`.
This not only flattens the document contents, it loses all information about the containing
elements in the document.
To process XML documents using an XML Transformation stage, the index pipeline must have as its
initial processing stage an Apache Tika Parser index stage which is configured to pass the
document through to the XML Transformation stage *as raw XML*, via the following configuration:

* UI checkbox "Add original document content" **unchecked** / REST API property "addOriginalContent" set to **false**
* UI checkbox "Return parsed content as XML or HTML" **checked** / REST API property "keepOriginalStructure" set to **true**
* UI checkbox "Return original XML and HTML instead of Tika XML output" **checked** / REST API property "returnXml" set to **true**

With this configuration, the Tika parser stage decodes the raw input stream of bytes into a string containing the entire XML document
which is returned in the PipelineDocument field `body`.

The pipeline must have a Field Mapping stage after the XML Transformation stage, before the Solr Indexer stage. The Field Mapping stage is used to remove the following fields from the document:

* \_raw\_content\_
* Content-Type
* Content-Length
* parsing
* parsing\_time

## XML Transforms

The XML Transformation stage uses a Solr
[XPathRecordReader](http://lucene.apache.org/solr/6_1_0/solr-dataimporthandler/org/apache/solr/handler/dataimport/XPathRecordReader.html)
which is a streaming XML parser that supports *only a limited subset of XPath selectors*.
It provides exact matching on element attributes and it can only extract the element text, not attribute values.

Examples of allowed XPath specifications, where "a", "b", and "c" are any element tags and "attrName" is any attribute name:

```
/a/b/c
/a/b/c[@attrName='someValue']
/a/b/c[@attrName=]/d
/a/b/c/@attrName
//b//...
```

<Note>
  When specifying the list of `mappings`, for each mapping, the specification for the `xpath` attribute must include the full path, i.e., the `xpath` attribute will include the `rootXPath`. See the example configuration below.
</Note>

## Example Stage Specification

*Definition of an XML-Transformation stage that extracts elements from a MEDLINE/Pubmed article abstract:*

```json wrap  expandable  theme={"dark"}
{ "type" : "xml-transform",
  "id" : "n0j2a9k9",
  "rootXPath" : "/MedlineCitationSet/MedlineCitation",
  "bodyField" : "body",
  "mappings" : [ {
      "xpath" : "/MedlineCitationSet/MedlineCitation/Article/ArticleTitle",
      "field" : "article-title_txt",
      "multivalue" : false
  }, {
      "xpath" : "/MedlineCitationSet/MedlineCitation/Article/Abstract/AbstractText",
      "field" : "article-abstract_txt",
      "multivalue" : true
  }, {
      "xpath" : "/MedlineCitationSet/MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName",
      "field" : "mesh-heading_txt",
      "multivalue" : true
  }, {
      "xpath" : "/MedlineCitationSet/MedlineCitation/PMID",
      "field" : "pmid_txt",
      "multivalue" : false
  } ],
  "keepParent" : false,
  "skip" : false,
  "label" : "medline_xml_transform"
}
```

*Template for a minimal index pipeline that includes an XML-Transformation stage. Replace the XPath and field names in the XML-Transformation stage according to your data.*

```json wrap  theme={"dark"}
{
    "id" : "xml-pipeline-default",
    "stages" : [ {
    "type" : "tika-parser",
    "includeImages" : false,
    "flattenCompound" : false,
    "addFailedDocs" : false,
    "addOriginalContent" : false,
    "contentField" : "_raw_content_",
    "returnXml" : true,
    "keepOriginalStructure" : true,
    "extractHtmlLinks" : false,
    "extractOtherLinks" : false,
    "csvParsing" : false,
    "skip" : false,
    "label" : "tika",
    "sourceField" : "_raw_content_"
    }, {
    "type" : "xml-transformation",
    "rootXPath" : "/ROOTS/ROOT",
    "bodyField" : "body",
    "mappings" : [ {
        "xpath" : "/ROOTS/ROOT/element",
        "field" : "element-field_t",
        "multivalue" : false
    } ],
    "keepParent" : false,
    "skip" : false,
    "label" : "xml"
    }, {
    "type" : "field-mapping",
    "mappings" : [ {
        "source" : "parsing",
        "operation" : "delete"
    }, {
        "source" : "parsing_time",
        "operation" : "delete"
    }, {
        "source" : "Content-Type",
        "operation" : "delete"
    }, {
        "source" : "Content-Length",
        "operation" : "delete"
    } ],
    "skip" : false,
    "label" : "field mapping"
    }, {
    "type" : "solr-index",
    "enforceSchema" : true,
    "bufferDocsForSolr" : false,
    "skip" : false,
    "label" : "solr-index"
    } ]
}
```

## Configuration

<Tip>
  When entering configuration values in the UI, use *unescaped* characters, such as `\t` for the tab character. When entering configuration values in the API, use *escaped* characters, such as `\\t` for the tab character.
</Tip>

<SchemaParamFields schema={schema} />