import requests
# Fetch the schema served at the async-parsing `_schema/parsers` endpoint.
# `{FUSION HOST}` is a placeholder; substitute the real host name before running.
url = "https://{FUSION HOST}/async-parsing/_schema/parsers"
# Without an explicit timeout `requests` waits indefinitely on an unresponsive
# host, so bound the wait (in seconds) to make the script fail fast instead.
response = requests.get(url, timeout=30)
print(response.text)
{
"type": "object",
"title": "Parser Configuration",
"description": "List of parser stages to use for handling incoming streams",
"required": [
"id"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "182d49be-fc4b-4afe-9657-4bbaef34569d",
"maxLength": 128,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"idField": {
"type": "string",
"title": "Document ID Source Field",
"description": "A document field to use as the document ID"
},
"enableMediaTypeDetection": {
"type": "boolean",
"title": "Enable automatic media type detection",
"description": "Automatically detect the Content-Type of each document; disable this to use `application/octet-stream`.",
"default": true
},
"detectMediaTypeBasedOnExtension": {
"type": "boolean",
"title": "Detect media type based on extension",
"description": "Use file extension to detect Content-Type of a document before attempting to detect type based on content.",
"default": true
},
"maxParserDepth": {
"type": "integer",
"title": "Maximum Parser Recursion Depth",
"description": "Maximum number of times a parser may recurse over any document before proceeding to the next parser.",
"default": 16,
"minimum": 0,
"exclusiveMinimum": false
},
"maxFieldLength": {
"type": "integer",
"title": "Maximum Document Field Length",
"description": "Maximum allowed document field length in bytes. Field values exceeding this limit will be truncated.",
"default": 1048576,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -1,
"exclusiveMinimum": false
},
"parserStages": {
"type": "array",
"items": {
"type": "object",
"properties": {},
"oneOf": [
{
"type": "object",
"title": "Apache Tika Container Parser",
"description": "Parse documents using the tika-server container 'only when async-parsing is configured.' This parser is a wrapper around the tika-server REST API. It sends the document to the tika-server container and receives the parsed content.",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "c180a1f1-f4aa-4193-8b55-55367c971f18"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"includeImages": {
"type": "boolean",
"title": "Include images",
"default": false
},
"excludeContentTypes": {
"type": "array",
"title": "Content types to exclude",
"description": "List of content types to exclude from parsing",
"items": {
"type": "string",
"minLength": 1
}
},
"embeddedDocumentHandling": {
"type": "string",
"title": "Embedded document handling",
"description": "Controls the handling of embedded documents: generate a different one each time, merge all in a single document or skip embedded documents",
"enum": [
"split_documents",
"merge_documents",
"skip_embedded_documents"
],
"default": "split_documents"
},
"addImageOriginalContent": {
"type": "boolean",
"title": "Add original image content (raw bytes)",
"description": "For images only. When true, the original image content is added to the document. Default is false.",
"default": false
},
"type": {
"type": "string",
"enum": [
"tika-container"
],
"default": "tika-container"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Solr Update",
"description": "Parser for Solr \"update\" messages (xml, json, csv and javabin).",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "467a05c9-035d-4db4-8288-b63b814ac016"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"enableCsv": {
"type": "boolean",
"title": "Enable CSV",
"description": "Enables the parser to recognize and parse CSV-based Solr update messages.",
"default": true
},
"enableXml": {
"type": "boolean",
"title": "Enable XML",
"description": "Enables the parser to recognize and parse XML-based Solr update messages.",
"default": true
},
"enableJson": {
"type": "boolean",
"title": "Enable JSON",
"description": "Enables the parser to recognize and parse JSON-based Solr update messages.",
"default": true
},
"type": {
"type": "string",
"enum": [
"solr-update"
],
"default": "solr-update"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "XML",
"description": "Parse xml content with optional splitting",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "bb871790-9481-4372-adb3-3962faa132b3"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"rootPaths": {
"type": "array",
"title": "Root paths",
"description": "Read XML elements that can be found on specified XML paths and parse them into separate documents",
"default": [
"/"
],
"items": {
"type": "string"
}
},
"maxSize": {
"type": "integer",
"title": "Maximum output size for each document",
"description": "Maximum number of XML characters, excluding extra whitespace, that will be processed from each source document node to produce an output document",
"default": 65536
},
"listHandling": {
"type": "string",
"title": "XML List handling",
"description": "Create a single multivalued field containing all items, or a separate index-numbered field per list item?",
"enum": [
"multivalued",
"index_numbered"
],
"default": "multivalued",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"enum": [
"xml"
],
"default": "xml"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Grok",
"description": "Parses semi structured content using Grok patterns (like Regex, see https://github.com/thekrakken/java-grok). This is often ideal for understanding log files, but can be used for other purposes.",
"required": [
"charset",
"ignoreBOM",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "59174804-dcb2-4d9e-86bf-92762866e44a"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"ignoreBOM": {
"type": "boolean",
"title": "Ignore BOM",
"description": "Ignore Byte-Order Mark (BOM) if present and always use the configured character set. When set to false a valid BOM character set overrides the configured default character set.",
"default": false
},
"grokDefinition": {
"type": "string",
"title": "Grok Definition",
"description": "Custom Grok definition",
"hints": [
"code/javascript"
]
},
"grokPattern": {
"type": "string",
"title": "Grok Pattern",
"description": "Grok parsing pattern",
"hints": [
"code/javascript"
]
},
"type": {
"type": "string",
"enum": [
"grok"
],
"default": "grok"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "CSV",
"description": "Parse CSV content",
"required": [
"charset",
"ignoreBOM",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "2982cdab-c07e-4a3d-ae9c-f774f48a40c6"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"ignoreBOM": {
"type": "boolean",
"title": "Ignore BOM",
"description": "Ignore Byte-Order Mark (BOM) if present and always use the configured character set. When set to false a valid BOM character set overrides the configured default character set.",
"default": false
},
"delimiter": {
"type": "string",
"title": "Delimiter",
"description": "Delimiter character between fields. Any single character, including an escaped character, is valid, e.g. , (comma), \\t (tab), or | (pipe). Default is comma if auto-detection is disabled",
"minLength": 1
},
"quote": {
"type": "string",
"title": "Quote",
"description": "Quote character, default is a double quote (\") if auto-detection is disabled",
"maxLength": 1
},
"quoteEscape": {
"type": "string",
"title": "Quote escape",
"description": "Quote escape character, default is a double quote (\") if auto-detection is disabled ",
"maxLength": 1
},
"autoDetect": {
"type": "boolean",
"title": "Auto-detect CSV Format",
"description": "Attempt to guess the delimiter, quote, quote escape, and comment characters",
"default": true
},
"trimWhitespace": {
"type": "boolean",
"title": "Trim whitespace",
"description": "Trim off leading and trailing whitespace from columns, default true",
"default": true
},
"hasHeaders": {
"type": "boolean",
"title": "Headers in file",
"description": "Treat the first row as column headers, default true",
"default": true
},
"headers": {
"type": "array",
"title": "Header list",
"description": "List of column headers, overrides file headers if present",
"items": {
"type": "string"
}
},
"skipEmptyLines": {
"type": "boolean",
"title": "Skip empty lines",
"description": "Skip any empty lines encountered, default true",
"default": true
},
"lineSeparator": {
"type": "string",
"title": "Line Separator",
"description": "Line separator character",
"minLength": 1
},
"nullValue": {
"type": "string",
"title": "Null value",
"description": "A string value to replace nulls with, no default"
},
"emptyValue": {
"type": "string",
"title": "Empty string replacement",
"description": "A string value to replace empty strings with, no default"
},
"includeRowNumber": {
"type": "boolean",
"title": "Include row number",
"description": "Include the row number (line number) in the emitted documents, default true",
"default": true
},
"comment": {
"type": "string",
"title": "Comment character",
"description": "Character at start of row to indicate a comment, default is hash (#) if auto-detection is disabled",
"maxLength": 1
},
"commentHandling": {
"type": "string",
"title": "Comment Handling",
"description": "How to handle comments: ignore, add as a field to the next document, or add as a separate document; default is ignore",
"enum": [
"ignore",
"as_field",
"as_document"
],
"default": "ignore"
},
"maxRowLength": {
"type": "integer",
"title": "Maximum line length",
"description": "Maximum number of characters to allow for a single read line, default 10MB",
"default": 10485760,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxNumColumns": {
"type": "integer",
"title": "Maximum number of columns",
"description": "Maximum number of columns to allow for a single row, default 1000",
"default": 1000,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxColumnChars": {
"type": "integer",
"title": "Maximum number of characters per column",
"description": "Maximum number of characters a single column value can have, default 10MB",
"default": 10485760,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"columnHandling": {
"type": "string",
"title": "Column mismatch handling",
"description": "What to do when a row has too many or too few columns: Can throw an error, align the column, or do nothing special (default)",
"enum": [
"error",
"align",
"default"
],
"default": "default"
},
"fillValue": {
"type": "string",
"title": "Column fill value",
"description": "A string value to use when aligning the columns (when Column Mismatch Handling is \"align\")",
"default": "<FILL>"
},
"type": {
"type": "string",
"enum": [
"csv"
],
"default": "csv"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "JSON",
"description": "Parses JSON documents with optional splitting and mappings",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "4dc1da64-64f3-4b7c-93d3-07b46df3f864"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"rootPath": {
"type": "string",
"title": "Root path",
"description": "Use only children of this JSON pointer."
},
"includePath": {
"type": "boolean",
"title": "Include root path",
"description": "Include parent element names when using a root path.",
"default": false
},
"splitArrays": {
"type": "boolean",
"title": "Split arrays",
"description": "First split top-level arrays into multiple documents, and then apply other rules.",
"default": true
},
"expectJsonL": {
"type": "boolean",
"title": "Expect JSONL",
"description": "Expect the input to contain multiple line separated JSON documents",
"default": false
},
"maxLineSize": {
"type": "integer",
"title": "Max line size",
"description": "Set maximum size of a line in bytes. This is important for processing JsonL with large documents.",
"default": 8192
},
"mappings": {
"type": "array",
"title": "Mapping rules",
"description": "Extract parts of the document into specified fields",
"items": {
"type": "object",
"required": [
"path",
"target"
],
"properties": {
"path": {
"type": "string",
"title": "JSONPath expression"
},
"target": {
"type": "string",
"title": "Target field"
}
}
}
},
"listHandling": {
"type": "string",
"title": "JSON List handling",
"description": "Create a single multivalued field containing all items, or a separate index-numbered field per list item?",
"enum": [
"multivalued",
"index_numbered"
],
"default": "multivalued",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"enum": [
"json"
],
"default": "json"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Archive",
"description": "Decompress and extract common archive and compression formats, e.g. zip, tar, 7z, GZip, BZip2, etc",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "600ded51-7210-46e4-bfc4-10b12306565f"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"alwaysDetect": {
"type": "boolean",
"title": "Always detect type",
"description": "Forces content-type detection. Most compression and archive formats use a magic byte to indicate their type. This can be more reliable than user input.",
"default": true
},
"type": {
"type": "string",
"enum": [
"archive"
],
"default": "archive"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "HTML",
"description": "Parse html content",
"required": [
"charset",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "c49468d4-7ecf-408f-b1d8-bc2b6e04ea85"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"recordSelector": {
"type": "string",
"title": "Record Selector"
},
"keepParent": {
"type": "boolean",
"title": "Keep Parent Document?",
"description": "Keep or discard parent document with selected records. Has no effect if Record Selector is not specified.",
"default": true
},
"excludeFilters": {
"type": "array",
"title": "Exclude filters",
"description": "Jsoup-formatted selectors for elements to exclude from the HTML document.",
"items": {
"type": "string"
}
},
"filterBeforeMapping": {
"type": "boolean",
"title": "Filter before mapping",
"description": "Apply exclude filters before performing HTML field mapping.",
"default": false
},
"filterBeforeExtractingLinks": {
"type": "boolean",
"title": "Filter before extracting links",
"description": "Apply exclude filters before performing link extraction.",
"default": false
},
"mappings": {
"type": "array",
"title": "HTML Element Mappings",
"items": {
"type": "object",
"required": [
"selectRule",
"field"
],
"properties": {
"selectRule": {
"type": "string",
"title": "Select Rule",
"description": "A jsoup selection rule, for example 'div#foo' to select '<div id=\"foo\">...</div>'"
},
"attribute": {
"type": "string",
"title": "Attribute to map",
"description": "What attribute of the selected element to map. For example 'href' to get the link URL from an '<a>' tag. Special values are '.outerText', '.html', '.outerHtml' and '.data'. If left blank, the text within this element will be mapped."
},
"field": {
"type": "string",
"title": "Target Field",
"description": "The field in which to save the mapped element"
},
"multivalued": {
"type": "boolean",
"title": "Multi-valued",
"description": "Set to true to map multiple elements if there is more than one match for the select rule",
"default": false
}
}
}
},
"extractHtmlLinks": {
"type": "boolean",
"title": "Extract HTML links",
"description": "Collect links explicitly declared in HTML document",
"default": false
},
"extractBodyText": {
"type": "boolean",
"title": "Extract body as a text",
"description": "Extract body as a text",
"default": true
},
"contentExtractionConfig": {
"type": "object",
"title": "Content Extraction (Experimental)",
"description": "Attempt to extract the content that matters using advanced heuristics. Might not work for your website.",
"properties": {
"extractContent": {
"type": "boolean",
"title": "Extract page content",
"default": true
},
"extractMetadata": {
"type": "boolean",
"title": "Extract metadata",
"default": true
},
"preserveContentFormat": {
"type": "boolean",
"title": "Store readable html version of body content",
"default": false
},
"metadataPrefix": {
"type": "string",
"title": "Optional prefix for rich content and extracted metadata"
}
}
},
"metatagsPrefix": {
"type": "string",
"title": "Optional prefix for metatags captured from html document",
"description": "Optional prefix to add to metatags (except id and charset) captured from an html document to be used as fields as-is (id and charset get prefix) from document",
"default": "false"
},
"type": {
"type": "string",
"enum": [
"html"
],
"default": "html"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Text",
"description": "Parses plain text content with optional trimming and splitting. Character encoding can be specified or automatically detected",
"required": [
"charset",
"ignoreBOM",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "bb65fd64-7156-4951-aeb8-11de68bde8f1"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"ignoreBOM": {
"type": "boolean",
"title": "Ignore BOM",
"description": "Ignore Byte-Order Mark (BOM) if present and always use the configured character set. When set to false a valid BOM character set overrides the configured default character set.",
"default": false
},
"splitLines": {
"type": "boolean",
"title": "Split lines",
"description": "Split text into lines to create multiple records, default false",
"default": false
},
"skipHeaderLines": {
"type": "integer",
"title": "Skip header lines",
"description": "Skip a number of header lines, default 0",
"default": 0
},
"trimWhitespace": {
"type": "boolean",
"title": "Trim whitespace",
"description": "Trim off leading and trailing whitespace from lines, default false",
"default": false
},
"skipEmptyLines": {
"type": "boolean",
"title": "Skip empty lines",
"description": "Skip any empty lines encountered, default false",
"default": false
},
"outputField": {
"type": "string",
"title": "Output field",
"description": "Name of the output field where text is stored, default 'body'",
"default": "body",
"minLength": 1
},
"maxLength": {
"type": "integer",
"title": "Maximum length",
"description": "Maximum number of characters to allow for the body, -1 for unlimited, default 1MB",
"default": 1048576,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxLineLength": {
"type": "integer",
"title": "Maximum line length",
"description": "Maximum number of characters to allow for any single line, default 1MB",
"default": 1048576,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"commentField": {
"type": "string",
"title": "Comment field",
"description": "Name of the output field where comment is stored, default 'comment'",
"default": "comment",
"minLength": 1
},
"comment": {
"type": "string",
"title": "Comment character",
"description": "Characters at start of line to indicate a comment, default # (hash)",
"default": "#",
"minLength": 1
},
"commentHandling": {
"type": "string",
"title": "Comment Handling",
"description": "How to handle comments: include as-is, ignore (and remove from text), add as field (and remove from text), default include",
"enum": [
"ignore",
"include",
"as_field"
],
"default": "include"
},
"type": {
"type": "string",
"enum": [
"text"
],
"default": "text"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Fallback",
"description": "If no previous parser stage was able to handle the stream, the fallback parser will copy the data into the _raw_content_ field for later parsing and analysis.",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "89d78ae5-048e-49f3-891e-5910e9d0fafc"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"metadataOnly": {
"type": "boolean",
"title": "Only parse metadata",
"default": false
},
"maxBytesToKeep": {
"type": "integer",
"title": "Maximum bytes to keep",
"default": 1048576
},
"type": {
"type": "string",
"enum": [
"fallback"
],
"default": "fallback"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Apache Tika (Deprecated) ",
"description": "Parse Office documents (ppt/docx/pdf), HTML files, images (jpeg/tiff), and more. See \"Supported Formats\" at https://tika.apache.org/ for a full list. This stage is deprecated. Use 'Apache Tika Container Parser' instead. This stage doesn't work with async-parsing.",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "6c992ac7-d6ee-469a-af03-a6d0ed26638f"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"includeImages": {
"type": "boolean",
"title": "Include images",
"default": false
},
"flattenCompound": {
"type": "boolean",
"title": "Flatten compound documents",
"default": false
},
"addFailedDocs": {
"type": "boolean",
"title": "Add failed documents",
"default": false
},
"addOriginalContent": {
"type": "boolean",
"title": "Add original document content (raw bytes)",
"default": false
},
"contentEncoding": {
"type": "string",
"title": "Content transport encoding of the content (per RFC1341)",
"enum": [
"binary",
"base64"
],
"default": "binary"
},
"returnXml": {
"type": "boolean",
"title": "Return parsed content as XML",
"default": false
},
"keepOriginalStructure": {
"type": "boolean",
"title": "Return original XML and HTML instead of Tika XML output (only applies if 'Return parsed content as XML' is true)",
"default": false
},
"extractHtmlLinks": {
"type": "boolean",
"title": "Extract XHTML links",
"description": "Collect links explicitly declared in document structure (e.g. using HTML tags, bookmarks, etc)",
"default": true
},
"extractOtherLinks": {
"type": "boolean",
"title": "Extract other links",
"description": "Use regex-based heuristic extractor to collect likely links from plain text content in all fields.",
"default": false
},
"excludeContentTypes": {
"type": "array",
"title": "Content types to exclude",
"description": "List of content types to exclude from parsing",
"items": {
"type": "string",
"minLength": 1
}
},
"zipBombCompressionRatio": {
"type": "integer",
"title": "Maximum input-to-output byte ratio",
"description": "Maximum number of output bytes fusion will generate per input byte. If you are indexing highly compressed files, you may increase this value to avoid triggering 'Zip Bomb' detection",
"default": 200
},
"zipBombMaxDepth": {
"type": "integer",
"title": "Maximum nesting depth",
"description": "Sets the maximum XML element nesting level. If you are indexing highly nested files, you may increase this value to avoid triggering 'Zip Bomb' detection",
"default": 200
},
"zipBombMaxPackageEntryDepth": {
"type": "integer",
"title": "Maximum package entry depth",
"description": "Sets the maximum package entry nesting level. If you are indexing highly nested files, you may increase this value to avoid triggering 'Zip Bomb' detection",
"default": 20
},
"type": {
"type": "string",
"enum": [
"tika"
],
"default": "tika"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
}
]
}
},
"optionalTargetPipelineId": {
"type": "string",
"title": "Target Pipeline ID",
"description": "Optional parameter. This property should be used only when parsing documents from connectors and when async-parsing is enabled. When set, the documents produced in the asynchronous parsing process will be sent to the specified pipeline. If not set, the documents will be sent to the pipeline that was configured in the datasource configuration."
},
"hidden": {
"type": "boolean",
"title": "Hidden",
"description": "Objects marked as hidden will only be returned in the API with hidden=true",
"hints": [
"hidden"
]
}
},
"category": "Other",
"categoryPriority": 1,
"unsafe": false
}
Fetch the JSON schema for the parsers API.
import requests
url = "https://{FUSION HOST}/async-parsing/_schema/parsers"
response = requests.get(url)
print(response.text)
{
"type": "object",
"title": "Parser Configuration",
"description": "List of parser stages to use for handling incoming streams",
"required": [
"id"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "182d49be-fc4b-4afe-9657-4bbaef34569d",
"maxLength": 128,
"pattern": "^[A-Za-z0-9_\\-]+$"
},
"idField": {
"type": "string",
"title": "Document ID Source Field",
"description": "A document field to use as the document ID"
},
"enableMediaTypeDetection": {
"type": "boolean",
"title": "Enable automatic media type detection",
"description": "Automatically detect the Content-Type of each document; disable this to use `application/octet-stream`.",
"default": true
},
"detectMediaTypeBasedOnExtension": {
"type": "boolean",
"title": "Detect media type based on extension",
"description": "Use file extension to detect Content-Type of a document before attempting to detect type based on content.",
"default": true
},
"maxParserDepth": {
"type": "integer",
"title": "Maximum Parser Recursion Depth",
"description": "Maximum number of times a parser may recurse over any document before proceeding to the next parser.",
"default": 16,
"minimum": 0,
"exclusiveMinimum": false
},
"maxFieldLength": {
"type": "integer",
"title": "Maximum Document Field Length",
"description": "Maximum allowed document field length in bytes. Field values exceeding this limit will be truncated.",
"default": 1048576,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -1,
"exclusiveMinimum": false
},
"parserStages": {
"type": "array",
"items": {
"type": "object",
"properties": {},
"oneOf": [
{
"type": "object",
"title": "Apache Tika Container Parser",
"description": "Parse documents using the tika-server container 'only when async-parsing is configured.' This parser is a wrapper around the tika-server REST API. It sends the document to the tika-server container and receives the parsed content.",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "c180a1f1-f4aa-4193-8b55-55367c971f18"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"includeImages": {
"type": "boolean",
"title": "Include images",
"default": false
},
"excludeContentTypes": {
"type": "array",
"title": "Content types to exclude",
"description": "List of content types to exclude from parsing",
"items": {
"type": "string",
"minLength": 1
}
},
"embeddedDocumentHandling": {
"type": "string",
"title": "Embedded document handling",
"description": "Controls the handling of embedded documents: generate a different one each time, merge all in a single document or skip embedded documents",
"enum": [
"split_documents",
"merge_documents",
"skip_embedded_documents"
],
"default": "split_documents"
},
"addImageOriginalContent": {
"type": "boolean",
"title": "Add original image content (raw bytes)",
"description": "For images only. When true, the original image content is added to the document. Default is false.",
"default": false
},
"type": {
"type": "string",
"enum": [
"tika-container"
],
"default": "tika-container"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Solr Update",
"description": "Parser for Solr \"update\" messages (xml, json, csv and javabin).",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "467a05c9-035d-4db4-8288-b63b814ac016"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"enableCsv": {
"type": "boolean",
"title": "Enable CSV",
"description": "Enables the parser to recognize and parse CSV-based Solr update messages.",
"default": true
},
"enableXml": {
"type": "boolean",
"title": "Enable XML",
"description": "Enables the parser to recognize and parse XML-based Solr update messages.",
"default": true
},
"enableJson": {
"type": "boolean",
"title": "Enable JSON",
"description": "Enables the parser to recognize and parse JSON-based Solr update messages.",
"default": true
},
"type": {
"type": "string",
"enum": [
"solr-update"
],
"default": "solr-update"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "XML",
"description": "Parse xml content with optional splitting",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "bb871790-9481-4372-adb3-3962faa132b3"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"rootPaths": {
"type": "array",
"title": "Root paths",
"description": "Read XML elements that can be found on specified XML paths and parse them into separate documents",
"default": [
"/"
],
"items": {
"type": "string"
}
},
"maxSize": {
"type": "integer",
"title": "Maximum output size for each document",
"description": "Maximum number of XML characters, excluding extra whitespace, that will be processed from each source document node to produce an output document",
"default": 65536
},
"listHandling": {
"type": "string",
"title": "XML List handling",
"description": "Create a single multivalued field containing all items, or a separate index-numbered field per list item?",
"enum": [
"multivalued",
"index_numbered"
],
"default": "multivalued",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"enum": [
"xml"
],
"default": "xml"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Grok",
"description": "Parses semi structured content using Grok patterns (like Regex, see https://github.com/thekrakken/java-grok). This is often ideal for understanding log files, but can be used for other purposes.",
"required": [
"charset",
"ignoreBOM",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "59174804-dcb2-4d9e-86bf-92762866e44a"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"ignoreBOM": {
"type": "boolean",
"title": "Ignore BOM",
"description": "Ignore Byte-Order Mark (BOM) if present and always use the configured character set. When set to false a valid BOM character set overrides the configured default character set.",
"default": false
},
"grokDefinition": {
"type": "string",
"title": "Grok Definition",
"description": "Custom Grok definition",
"hints": [
"code/javascript"
]
},
"grokPattern": {
"type": "string",
"title": "Grok Pattern",
"description": "Grok parsing pattern",
"hints": [
"code/javascript"
]
},
"type": {
"type": "string",
"enum": [
"grok"
],
"default": "grok"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "CSV",
"description": "Parse CSV content",
"required": [
"charset",
"ignoreBOM",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "2982cdab-c07e-4a3d-ae9c-f774f48a40c6"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"ignoreBOM": {
"type": "boolean",
"title": "Ignore BOM",
"description": "Ignore Byte-Order Mark (BOM) if present and always use the configured character set. When set to false a valid BOM character set overrides the configured default character set.",
"default": false
},
"delimiter": {
"type": "string",
"title": "Delimiter",
"description": "Delimiter character between fields. Any single character, including an escaped character, is valid, e.g. , (comma), \\t (tab), or | (pipe). Default is comma if auto-detection is disabled",
"minLength": 1
},
"quote": {
"type": "string",
"title": "Quote",
"description": "Quote character, default is a double quote (\") if auto-detection is disabled",
"maxLength": 1
},
"quoteEscape": {
"type": "string",
"title": "Quote escape",
"description": "Quote escape character, default is a double quote (\") if auto-detection is disabled ",
"maxLength": 1
},
"autoDetect": {
"type": "boolean",
"title": "Auto-detect CSV Format",
"description": "Attempt to guess the delimiter, quote, quote escape, and comment characters",
"default": true
},
"trimWhitespace": {
"type": "boolean",
"title": "Trim whitespace",
"description": "Trim off leading and trailing whitespace from columns, default true",
"default": true
},
"hasHeaders": {
"type": "boolean",
"title": "Headers in file",
"description": "Treat the first row as column headers, default true",
"default": true
},
"headers": {
"type": "array",
"title": "Header list",
"description": "List of column headers, overrides file headers if present",
"items": {
"type": "string"
}
},
"skipEmptyLines": {
"type": "boolean",
"title": "Skip empty lines",
"description": "Skip any empty lines encountered, default true",
"default": true
},
"lineSeparator": {
"type": "string",
"title": "Line Separator",
"description": "Line separator character",
"minLength": 1
},
"nullValue": {
"type": "string",
"title": "Null value",
"description": "A string value to replace nulls with, no default"
},
"emptyValue": {
"type": "string",
"title": "Empty string replacement",
"description": "A string value to replace empty strings with, no default"
},
"includeRowNumber": {
"type": "boolean",
"title": "Include row number",
"description": "Include the row number (line number) in the emitted documents, default true",
"default": true
},
"comment": {
"type": "string",
"title": "Comment character",
"description": "Character at start of row to indicate a comment, default is hash (#) if auto-detection is disabled",
"maxLength": 1
},
"commentHandling": {
"type": "string",
"title": "Comment Handling",
"description": "How to handle comments: ignore, add as field to next document, or add as separate documents, default ignore",
"enum": [
"ignore",
"as_field",
"as_document"
],
"default": "ignore"
},
"maxRowLength": {
"type": "integer",
"title": "Maximum line length",
"description": "Maximum number of characters to allow for a single read line, default 10MB",
"default": 10485760,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxNumColumns": {
"type": "integer",
"title": "Maximum number of columns",
"description": "Maximum number of columns to allow for a single row, default 1000",
"default": 1000,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxColumnChars": {
"type": "integer",
"title": "Maximum number of characters per column",
"description": "Maximum number of characters a single column value can have, default 10MB",
"default": 10485760,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"columnHandling": {
"type": "string",
"title": "Column mismatch handling",
"description": "What to do when a row has too many or too few columns: Can throw an error, align the column, or do nothing special (default)",
"enum": [
"error",
"align",
"default"
],
"default": "default"
},
"fillValue": {
"type": "string",
"title": "Column fill value",
"description": "A string value to use when aligning the columns (when Column Mismatch Handling is \"align\")",
"default": "<FILL>"
},
"type": {
"type": "string",
"enum": [
"csv"
],
"default": "csv"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "JSON",
"description": "Parses JSON documents with optional splitting and mappings",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "4dc1da64-64f3-4b7c-93d3-07b46df3f864"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"rootPath": {
"type": "string",
"title": "Root path",
"description": "Use only children of this JSON pointer."
},
"includePath": {
"type": "boolean",
"title": "Include root path",
"description": "Include parent element names when using a root path.",
"default": false
},
"splitArrays": {
"type": "boolean",
"title": "Split arrays",
"description": "First split top-level arrays into multiple documents, and then apply other rules.",
"default": true
},
"expectJsonL": {
"type": "boolean",
"title": "Expect JSONL",
"description": "Expect the input to contain multiple line separated JSON documents",
"default": false
},
"maxLineSize": {
"type": "integer",
"title": "Max line size",
"description": "Set maximum size of a line in bytes. This is important for processing JsonL with large documents.",
"default": 8192
},
"mappings": {
"type": "array",
"title": "Mapping rules",
"description": "Extract parts of the document into specified fields",
"items": {
"type": "object",
"required": [
"path",
"target"
],
"properties": {
"path": {
"type": "string",
"title": "JSONPath expression"
},
"target": {
"type": "string",
"title": "Target field"
}
}
}
},
"listHandling": {
"type": "string",
"title": "JSON List handling",
"description": "Create a single multivalued field containing all items, or a separate index-numbered field per list item?",
"enum": [
"multivalued",
"index_numbered"
],
"default": "multivalued",
"hints": [
"advanced"
]
},
"type": {
"type": "string",
"enum": [
"json"
],
"default": "json"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Archive",
"description": "Decompress and extract common archive and compression formats, e.g. zip, tar, 7z, GZip, BZip2, etc",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "600ded51-7210-46e4-bfc4-10b12306565f"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"alwaysDetect": {
"type": "boolean",
"title": "Always detect type",
"description": "Forces content-type detection. Most compression and archive formats use a magic byte to indicate their type. This can be more reliable than user input.",
"default": true
},
"type": {
"type": "string",
"enum": [
"archive"
],
"default": "archive"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "HTML",
"description": "Parse html content",
"required": [
"charset",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "c49468d4-7ecf-408f-b1d8-bc2b6e04ea85"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"recordSelector": {
"type": "string",
"title": "Record Selector"
},
"keepParent": {
"type": "boolean",
"title": "Keep Parent Document?",
"description": "Keep or discard parent document with selected records. Has no effect if Record Selector is not specified.",
"default": true
},
"excludeFilters": {
"type": "array",
"title": "Exclude filters",
"description": "Jsoup-formatted selectors for elements to exclude from the HTML document.",
"items": {
"type": "string"
}
},
"filterBeforeMapping": {
"type": "boolean",
"title": "Filter before mapping",
"description": "Apply exclude filters before performing HTML field mapping.",
"default": false
},
"filterBeforeExtractingLinks": {
"type": "boolean",
"title": "Filter before extracting links",
"description": "Apply exclude filters before performing link extraction.",
"default": false
},
"mappings": {
"type": "array",
"title": "HTML Element Mappings",
"items": {
"type": "object",
"required": [
"selectRule",
"field"
],
"properties": {
"selectRule": {
"type": "string",
"title": "Select Rule",
"description": "A jsoup selection rule, for example 'div#foo' to select '<div id=\"foo\">...</div>'"
},
"attribute": {
"type": "string",
"title": "Attribute to map",
"description": "What attribute of the selected element to map. For example 'href' to get the link URL from an '<a>' tag. Special values are '.outerText', '.html', '.outerHtml' and '.data'. If left blank, the text within this element will be mapped."
},
"field": {
"type": "string",
"title": "Target Field",
"description": "The field in which to save the mapped element"
},
"multivalued": {
"type": "boolean",
"title": "Multi-valued",
"description": "Set to true to map multiple elements if there is more than one match for the select rule",
"default": false
}
}
}
},
"extractHtmlLinks": {
"type": "boolean",
"title": "Extract HTML links",
"description": "Collect links explicitly declared in HTML document",
"default": false
},
"extractBodyText": {
"type": "boolean",
"title": "Extract body as a text",
"description": "Extract body as a text",
"default": true
},
"contentExtractionConfig": {
"type": "object",
"title": "Content Extraction (Experimental)",
"description": "Attempt to extract the content that matters using advanced heuristics. Might not work for your website.",
"properties": {
"extractContent": {
"type": "boolean",
"title": "Extract page content",
"default": true
},
"extractMetadata": {
"type": "boolean",
"title": "Extract metadata",
"default": true
},
"preserveContentFormat": {
"type": "boolean",
"title": "Store readable html version of body content",
"default": false
},
"metadataPrefix": {
"type": "string",
"title": "Optional prefix for rich content and extracted metadata"
}
}
},
"metatagsPrefix": {
"type": "string",
"title": "Optional prefix for metatags captured from an html document",
"description": "Optional prefix to add to metatags (except id and charset) captured from an html document to be used as fields as-is (id and charset get prefix) from document",
"default": "false"
},
"type": {
"type": "string",
"enum": [
"html"
],
"default": "html"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Text",
"description": "Parses plain text content with optional trimming and splitting. Character encoding can be specified or automatically detected",
"required": [
"charset",
"ignoreBOM",
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "bb65fd64-7156-4951-aeb8-11de68bde8f1"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"charset": {
"type": "string",
"title": "Character Set",
"description": "Example: \"UTF-8\"",
"default": "detect"
},
"ignoreBOM": {
"type": "boolean",
"title": "Ignore BOM",
"description": "Ignore Byte-Order Mark (BOM) if present and always use the configured character set. When set to false a valid BOM character set overrides the configured default character set.",
"default": false
},
"splitLines": {
"type": "boolean",
"title": "Split lines",
"description": "Split text into lines to create multiple records, default false",
"default": false
},
"skipHeaderLines": {
"type": "integer",
"title": "Skip header lines",
"description": "Skip a number of header lines, default 0",
"default": 0
},
"trimWhitespace": {
"type": "boolean",
"title": "Trim whitespace",
"description": "Trim off leading and trailing whitespace from lines, default false",
"default": false
},
"skipEmptyLines": {
"type": "boolean",
"title": "Skip empty lines",
"description": "Skip any empty lines encountered, default false",
"default": false
},
"outputField": {
"type": "string",
"title": "Output field",
"description": "Name of the output field where text is stored, default 'body'",
"default": "body",
"minLength": 1
},
"maxLength": {
"type": "integer",
"title": "Maximum length",
"description": "Maximum number of characters to allow for the body, -1 for unlimited, default 1MB",
"default": 1048576,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"maxLineLength": {
"type": "integer",
"title": "Maximum line length",
"description": "Maximum number of characters to allow for any single line, default 1MB",
"default": 1048576,
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false
},
"commentField": {
"type": "string",
"title": "Comment field",
"description": "Name of the output field where comment is stored, default 'comment'",
"default": "comment",
"minLength": 1
},
"comment": {
"type": "string",
"title": "Comment character",
"description": "Characters at start of line to indicate a comment, default # (hash)",
"default": "#",
"minLength": 1
},
"commentHandling": {
"type": "string",
"title": "Comment Handling",
"description": "How to handle comments: include as-is, ignore (and remove from text), add as field (and remove from text), default include",
"enum": [
"ignore",
"include",
"as_field"
],
"default": "include"
},
"type": {
"type": "string",
"enum": [
"text"
],
"default": "text"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Fallback",
"description": "If no previous parser stage was able to handle the stream, the fallback parser will copy the data into the _raw_content_ field for later parsing and analysis.",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "89d78ae5-048e-49f3-891e-5910e9d0fafc"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"metadataOnly": {
"type": "boolean",
"title": "Only parse metadata",
"default": false
},
"maxBytesToKeep": {
"type": "integer",
"title": "Maximum bytes to keep",
"default": 1048576
},
"type": {
"type": "string",
"enum": [
"fallback"
],
"default": "fallback"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
},
{
"type": "object",
"title": "Apache Tika (Deprecated) ",
"description": "Parse Office documents (ppt/docx/pdf), HTML files, images (jpeg/tiff), and more. See \"Supported Formats\" at https://tika.apache.org/ for a full list. This stage is deprecated. Use 'Apache Tika Container Parser' instead. This stage doesn't work with async-parsing.",
"required": [
"type"
],
"properties": {
"id": {
"type": "string",
"title": "Parser ID",
"default": "6c992ac7-d6ee-469a-af03-a6d0ed26638f"
},
"label": {
"type": "string",
"title": "Label",
"description": "A label for this Parser Stage",
"maxLength": 255
},
"enabled": {
"type": "boolean",
"title": "Enable this Parser Stage",
"default": true
},
"mediaTypes": {
"type": "array",
"title": "Media Types to match",
"description": "Documents with a media type on this list will be matched by this parser stage. See inheritMediaTypes / use default media types for more.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"inheritMediaTypes": {
"type": "boolean",
"title": "Match default media types in this Parser Stage",
"description": "Each parser stage has a built-in list of media types it handles by default. If this setting is true, that list will be used along with any optional additional types provided in the mediaTypes list. If this setting is false, this stage will only be selected for media types in the mediaTypes list, and the mediaTypes list becomes a mandatory property which must have at least one valid media type.",
"default": true
},
"ignoredMediaTypes": {
"type": "array",
"title": "Media Types to ignore",
"description": "Documents with a media type on this list will be not be processed by this parser stage.",
"items": {
"type": "string",
"pattern": "^[^/]+/[^/]+$",
"format": "rfc2646"
}
},
"pathPatterns": {
"type": "array",
"title": "File names to parse",
"description": "Specify a file name or pattern that must be matched for this parser stage to run. Forward slashes (\"/\") are used to join names of files inside archives with the archive name.",
"items": {
"type": "object",
"properties": {
"syntax": {
"type": "string",
"title": "Pattern type",
"description": "glob uses bash shell-style wildcards; regex uses Java (PCRE-style) regex",
"enum": [
"glob",
"regex"
],
"default": "glob"
},
"pattern": {
"type": "string",
"title": "File name or pattern",
"description": "e.g.: \"z.txt\" or \"*.md\" or \"/a/*/b/f.txt\" for glob; \"z.txt$\" or \".*\\.txt$\" or \"^/a/[^/]*/b/f.txt$\" for regex"
}
}
}
},
"errorHandling": {
"type": "string",
"title": "Error Handling",
"enum": [
"ignore",
"log",
"fail",
"mark"
],
"default": "mark"
},
"outputFieldPrefix": {
"type": "string",
"title": "Prefix parsed fields with",
"description": "Fields extracted by this parser will be prefixed with this string. The remainder of the field name will be as detected in the stream",
"maxLength": 20,
"pattern": "^$|^[A-Za-z_][A-Za-z0-9_\\-\\.]+$"
},
"includeImages": {
"type": "boolean",
"title": "Include images",
"default": false
},
"flattenCompound": {
"type": "boolean",
"title": "Flatten compound documents",
"default": false
},
"addFailedDocs": {
"type": "boolean",
"title": "Add failed documents",
"default": false
},
"addOriginalContent": {
"type": "boolean",
"title": "Add original document content (raw bytes)",
"default": false
},
"contentEncoding": {
"type": "string",
"title": "Content transport encoding of the content (per RFC1341)",
"enum": [
"binary",
"base64"
],
"default": "binary"
},
"returnXml": {
"type": "boolean",
"title": "Return parsed content as XML",
"default": false
},
"keepOriginalStructure": {
"type": "boolean",
"title": "Return original XML and HTML instead of Tika XML output (only applies if 'Return parsed content as XML is true')",
"default": false
},
"extractHtmlLinks": {
"type": "boolean",
"title": "Extract XHTML links",
"description": "Collect links explicitly declared in document structure (e.g. using HTML tags, bookmarks, etc)",
"default": true
},
"extractOtherLinks": {
"type": "boolean",
"title": "Extract other links",
"description": "Use regex-based heuristic extractor to collect likely links from plain text content in all fields.",
"default": false
},
"excludeContentTypes": {
"type": "array",
"title": "Content types to exclude",
"description": "List of content types to exclude from parsing",
"items": {
"type": "string",
"minLength": 1
}
},
"zipBombCompressionRatio": {
"type": "integer",
"title": "Maximum input-to-output byte ratio",
"description": "Maximum number of output bytes fusion will generate per input byte. If you are indexing highly compressed files, you may increase this value to avoid triggering 'Zip Bomb' detection",
"default": 200
},
"zipBombMaxDepth": {
"type": "integer",
"title": "Maximum nesting depth",
"description": "Returns the maximum XML element nesting level. If you are indexing highly nested files, you may increase this value to avoid triggering 'Zip Bomb' detection",
"default": 200
},
"zipBombMaxPackageEntryDepth": {
"type": "integer",
"title": "Maximum package entry depth",
"description": "Sets the maximum package entry nesting level. If you are indexing highly nested files, you may increase this value to avoid triggering 'Zip Bomb' detection",
"default": 20
},
"type": {
"type": "string",
"enum": [
"tika"
],
"default": "tika"
}
},
"additionalProperties": false,
"category": "Other",
"categoryPriority": 1,
"unsafe": false
}
]
}
},
"optionalTargetPipelineId": {
"type": "string",
"title": "Target Pipeline ID",
"description": "Optional parameter. This property should be used only when parsing documents from connectors and whenthe async-parsing is enabled. When set, the documents produced in the asynchronous parsing process will be sent to the specified pipeline. If not set, the documents will be sent to the pipeline that was configured in the datasource configuration."
},
"hidden": {
"type": "boolean",
"title": "Hidden",
"description": "Objects marked as hidden will only be returned in the API with hidden=true",
"hints": [
"hidden"
]
}
},
"category": "Other",
"categoryPriority": 1,
"unsafe": false
}OK
string, number, integer, boolean, object, array, null, ref Show child attributes
Show child attributes
Show child attributes
Show child attributes
Was this page helpful?