curl --request GET \
--url https://{FUSION HOST}/api/connectors/schema/{type}
{
"id": "PpwsTj7f39",
"name": "lucidworks.web-v2",
"pluginVersion": "1.3.0",
"sdkVersion": "4.1.3",
"schema": {
"type": "object",
"title": "Web (v2)",
"description": "Connector for websites and web-based content resources.",
"required": [
"id",
"properties",
"pipelineId"
],
"properties": {
"properties": {
"type": "object",
"title": "Properties",
"description": "Plugin specific properties.",
"required": [],
"properties": {
"startLinks": {
"type": "array",
"title": "Start Links",
"description": "The URL(s) that the crawler will start crawling from, for example: https://en.wikipedia.org/wiki/Main_Page",
"hints": [
"v1LegacyId:startLinks"
],
"items": {
"type": "string"
}
},
"bulkStartLinks": {
"type": "string",
"title": "Bulk start links",
"description": "If a large number of start links must be defined, you can provide them here. One link per line.",
"hints": [
"lengthy"
]
},
"limitDocumentsConfig": {
"type": "object",
"title": "Limit Documents Properties",
"required": [],
"properties": {
"depth": {
"type": "number",
"title": "Max crawling depth",
"description": "Number of levels in a directory or site tree to descend for documents.",
"default": -1,
"hints": [
"v1LegacyId:depth"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"maxItems": {
"type": "number",
"title": "Max items",
"description": "Maximum number of documents to fetch. The default (-1) means no limit.",
"default": -1,
"hints": [
"v1LegacyId:maxItems"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"excludeExtensions": {
"type": "array",
"title": "Excluded file extensions",
"description": "File extensions that should not be fetched. This will limit this datasource to all extensions except this list.",
"hints": [
"v1LegacyId:excludeExtensions"
],
"items": {
"type": "string"
}
},
"excludeRegexes": {
"type": "array",
"title": "Exclusive regexes",
"description": "Regular expressions for URI patterns to exclude. This will limit this datasource to only URIs that do not match the regular expression.",
"hints": [
"v1LegacyId:excludeRegexes"
],
"items": {
"type": "string"
}
},
"includeExtensions": {
"type": "array",
"title": "Included file extensions",
"description": "File extensions to be fetched. This will limit this datasource to only these file extensions.",
"hints": [
"v1LegacyId:includeExtensions"
],
"items": {
"type": "string"
}
},
"includeRegexes": {
"type": "array",
"title": "Inclusive regexes",
"description": "Regular expressions for URI patterns to include. This will limit this datasource to only URIs that match the regular expression.",
"hints": [
"v1LegacyId:includeRegexes"
],
"items": {
"type": "string"
}
},
"maxSizeBytes": {
"type": "number",
"title": "Max file size (bytes)",
"description": "Maximum size, in bytes, of a document to fetch.",
"default": 20000000,
"hints": [
"v1LegacyId:f.maxSizeBytes"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"indexItemsDiscarded": {
"type": "boolean",
"title": "Index discarded document metadata",
"description": "Enable to index discarded document metadata",
"default": false,
"hints": [
"v1LegacyId:f.index_items_discarded"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.LimitDocumentsConfig"
]
},
"crawlAuthenticationConfig": {
"type": "object",
"title": "Crawl Authentication Properties",
"required": [],
"properties": {
"maintainCookies": {
"type": "boolean",
"title": "Maintain Cookies for No-Auth Crawls",
"description": "If you are not using authentication, then by default cookies are not stored in between web requests (stateless). If checked, cookies will be maintained between requests during the web crawl even when you are not using authentication. If you are using authentication, this checkbox has no effect on the crawl and can be ignored.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.maintainCookies"
]
},
"basicAuth": {
"type": "array",
"title": "Basic Authentication",
"description": "Settings for Basic authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"host": {
"type": "string",
"title": "Host",
"description": "The host of the site. You can specify * to authenticate against any host. (Warning: authentication cookies will be sent to any host.)",
"hints": [
"v1LegacyId:host"
]
},
"port": {
"type": "number",
"title": "Port",
"description": "The port. You can specify -1 to authenticate against any port",
"hints": [
"v1LegacyId:port"
],
"maximum": 65535,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"realm": {
"type": "string",
"title": "Realm",
"description": "The realm for the site, if applicable",
"hints": [
"v1LegacyId:realm"
]
},
"username": {
"type": "string",
"title": "Username",
"description": "The username to use for authentication.",
"hints": [
"v1LegacyId:userName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for authentication.",
"hints": [
"v1LegacyId:password",
"secret"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.BasicAuthConfig"
]
}
},
"digestAuth": {
"type": "array",
"title": "Digest Authentication",
"description": "Settings for Digest authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"host": {
"type": "string",
"title": "Host",
"description": "The host of the site. You can specify * to authenticate against any host. (Warning: authentication cookies will be sent to any host.)",
"hints": [
"v1LegacyId:host"
]
},
"port": {
"type": "number",
"title": "Port",
"description": "The port. You can specify -1 to authenticate against any port",
"hints": [
"v1LegacyId:port"
],
"maximum": 65535,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"realm": {
"type": "string",
"title": "Realm",
"description": "The realm for the site, if applicable",
"hints": [
"v1LegacyId:realm"
]
},
"username": {
"type": "string",
"title": "Username",
"description": "The username to use for authentication.",
"hints": [
"v1LegacyId:userName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for authentication.",
"hints": [
"v1LegacyId:password",
"secret"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.DigestAuthConfig"
]
}
},
"ntlmAuth": {
"type": "array",
"title": "NTLM Authentication",
"description": "Settings for NTLM authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"domain": {
"type": "string",
"title": "Domain",
"description": "The NTLM Domain",
"hints": [
"v1LegacyId:domain"
]
},
"workstation": {
"type": "string",
"title": "Workstation",
"description": "The NTLM Workstation name",
"hints": [
"v1LegacyId:workstation"
]
},
"host": {
"type": "string",
"title": "Host",
"description": "The host of the site. You can specify * to authenticate against any host. (Warning: authentication cookies will be sent to any host.)",
"hints": [
"v1LegacyId:host"
]
},
"port": {
"type": "number",
"title": "Port",
"description": "The port. You can specify -1 to authenticate against any port",
"hints": [
"v1LegacyId:port"
],
"maximum": 65535,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"realm": {
"type": "string",
"title": "Realm",
"description": "The realm for the site, if applicable",
"hints": [
"v1LegacyId:realm"
]
},
"username": {
"type": "string",
"title": "Username",
"description": "The username to use for authentication.",
"hints": [
"v1LegacyId:userName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for authentication.",
"hints": [
"v1LegacyId:password",
"secret"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.NtlmAuthConfig",
"com.lucidworks.connector.plugins.web.config.BasicAuthConfig"
]
}
},
"formAuth": {
"type": "array",
"title": "Form Authentication",
"description": "Settings for Form based authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"action": {
"type": "string",
"title": "URL",
"description": "The URL of the authentication endpoint",
"hints": [
"v1LegacyId:action"
]
},
"ttl": {
"type": "number",
"title": "TTL (ms)",
"description": "The \"time to live\" in milliseconds for the session that will be created after authentication.",
"hints": [
"v1LegacyId:ttl"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"passwordParamName": {
"type": "string",
"title": "Password Parameter",
"description": "Name of the parameter containing the password",
"hints": [
"v1LegacyId:passwordParamName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for the authentication request. This will be copied into the \"Parameters\" using the \"Password Parameter\" name as the key",
"hints": [
"v1LegacyId:password",
"secret"
]
},
"paramsList": {
"type": "array",
"title": "Parameters",
"description": "Parameters sent to the authentication endpoint",
"items": {
"type": "object",
"required": [],
"properties": {
"key": {
"type": "string",
"title": "Parameters Key",
"hints": [
"v1LegacyId:paramsKey"
]
},
"value": {
"type": "string",
"title": "Parameters value",
"hints": [
"v1LegacyId:paramsValue"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.FormAuthConfig$Params"
]
}
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.FormAuthConfig"
]
}
},
"samlAuth": {
"type": "array",
"title": "SAML/Smart Form Authentication",
"description": "Settings for SAML/Smart Form based authentication allows you to visit one or more web pages that contain form inputs such as username, password, security questions, etc., submitting each one in turn in order to become authenticated.",
"items": {
"type": "object",
"required": [],
"properties": {
"action": {
"type": "string",
"title": "URL",
"description": "The URL of the authentication endpoint",
"hints": [
"v1LegacyId:action"
]
},
"ttl": {
"type": "number",
"title": "TTL (ms)",
"description": "The \"time to live\" in milliseconds for the session that will be created after authentication.",
"hints": [
"v1LegacyId:ttl"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"passwordParamName": {
"type": "string",
"title": "Password Parameter",
"description": "Name of the parameter containing the password",
"hints": [
"v1LegacyId:passwordParamName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for the authentication request. This will be copied into the \"Parameters\" using the \"Password Parameter\" name as the key",
"hints": [
"v1LegacyId:password",
"secret"
]
},
"paramsList": {
"type": "array",
"title": "Parameters",
"description": "Parameters sent to the authentication endpoint",
"items": {
"type": "object",
"required": [],
"properties": {
"key": {
"type": "string",
"title": "Parameters Key",
"hints": [
"v1LegacyId:paramsKey"
]
},
"value": {
"type": "string",
"title": "Parameters value",
"hints": [
"v1LegacyId:paramsValue"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.FormAuthConfig$Params"
]
}
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.SamlAuthConfig",
"com.lucidworks.connector.plugins.web.config.FormAuthConfig"
]
}
},
"credentialsFile": {
"type": "string",
"title": "credentialsFile",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.credentialsFile"
]
},
"kerberosEnabled": {
"type": "boolean",
"title": "kerberosEnabled",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"default": false,
"hints": [
"v1LegacyId:f.kerberosEnabled"
]
},
"kerberosLoginContextName": {
"type": "string",
"title": "kerberosLoginContextName",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.kerberosLoginContextName"
]
},
"kerberosSpn": {
"type": "string",
"title": "kerberosSpn",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.kerberosSpn"
]
},
"kerberosPrincipal": {
"type": "string",
"title": "kerberosPrincipal",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.kerberosPrincipal"
]
},
"kerberosKeytabFile": {
"type": "string",
"title": "kerberosKeytabFile",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.kerberosKeytabFile"
]
},
"kerberosKeytabBase64": {
"type": "string",
"title": "kerberosKeytabBase64",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.kerberosKeytabBase64"
]
},
"kerberosPassword": {
"type": "string",
"title": "kerberosPassword",
"description": "This property is no longer in use and is retained only for backward-compatible configuration validation.",
"hints": [
"v1LegacyId:f.kerberosPassword",
"secret"
]
},
"obeyRobots": {
"type": "boolean",
"title": "Obey robots.txt",
"description": "If true, Allow, Disallow and other rules found in a robots.txt file will be obeyed.",
"default": false,
"hints": [
"v1LegacyId:f.obeyRobots"
]
},
"obeyRobotsMeta": {
"type": "boolean",
"title": "Obey robots meta tags and headers",
"description": "If true, rules like 'noindex', 'nofollow' and others found in a robots meta tag on a page or in the headers of the HTTP response are obeyed.",
"default": false,
"hints": [
"v1LegacyId:f.obeyRobotsMeta"
]
},
"obeyLinkNofollow": {
"type": "boolean",
"title": "Obey link nofollow attributes",
"description": "If true, rel='nofollow' attributes on links are obeyed.",
"default": false,
"hints": [
"v1LegacyId:f.obeyLinkNofollow"
]
},
"proxy": {
"type": "string",
"title": "HTTP proxy address",
"description": "Address of the HTTP proxy, if required. This should be entered in the format host:port.",
"hints": [
"v1LegacyId:f.proxy"
]
},
"allowAllCertificates": {
"type": "boolean",
"title": "Allow all HTTPS certificates",
"description": "If false, security checks will be performed on all SSL/TLS certificate signers and origins. This means self-signed certificates would not be supported.",
"default": false,
"hints": [
"v1LegacyId:f.allowAllCertificates"
]
},
"useIpAddressForSslConnections": {
"type": "boolean",
"title": "Use IP address for SSL connections",
"description": "Use IP address instead of host name for SSL connections. This is used to work around a misconfigured HTTP server throwing an 'unrecognized name' error when SNI is enabled. (This only works if the 'Allow all certificates' setting is also enabled.)",
"default": false,
"hints": [
"v1LegacyId:f.useIpAddressForSslConnections"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlAuthenticationConfig"
]
},
"crawlHistoryConfig": {
"type": "object",
"title": "Crawl History Properties",
"required": [],
"properties": {
"crawlDBType": {
"type": "string",
"title": "Crawl database type",
"description": "The type of crawl database to use, in-memory or on-disk.",
"default": "on-disk",
"hints": [
"advanced",
"v1LegacyId:crawlDBType"
]
},
"commitAfterItems": {
"type": "number",
"title": "Commit After This Many Items",
"description": "Commit the crawlDB to disk after this many items have been received. A smaller number here will result in a slower crawl because of commits to disk being more frequent; conversely, a larger number here will cause a resumed job after a crash to need to recrawl more records.",
"default": 10000,
"hints": [
"advanced",
"v1LegacyId:commitAfterItems"
],
"maximum": 9999999,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"retainOutlinks": {
"type": "boolean",
"title": "Retain links in the crawldb",
"description": "Set to true for links found during fetching to be stored in the crawldb. This increases precision in certain recrawl scenarios, but requires more memory and disk space.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:retainOutlinks"
]
},
"aliasExpiration": {
"type": "number",
"title": "Alias expiration",
"description": "The number of crawls after which an alias will expire. The default is 1 crawl.",
"default": 1,
"hints": [
"advanced",
"v1LegacyId:aliasExpiration"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"discardLinkURLQueries": {
"type": "boolean",
"title": "Discard queries in link URLs",
"description": "If true, query parameters found in URLs will be removed before being added to the discovery queue.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.discardLinkURLQueries"
]
},
"discardLinkURLAnchors": {
"type": "boolean",
"title": "Discard anchors in link URLs",
"description": "If true, anchors found in URLs will be removed before being added to the discovery queue.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.discardLinkURLAnchors"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlHistoryConfig"
]
},
"crawlIdConfig": {
"type": "object",
"title": "Crawl Id Properties",
"required": [],
"properties": {
"userAgentName": {
"type": "string",
"title": "HTTP user-agent name",
"description": "Name the connector should use when identifying itself to a website in order to crawl it.",
"default": "Lucidworks-Anda/2.0",
"hints": [
"advanced",
"v1LegacyId:f.userAgentName"
]
},
"userAgentEmail": {
"type": "string",
"title": "HTTP user-agent email address",
"description": "Email address to use as part of connector identification.",
"hints": [
"v1LegacyId:f.userAgentEmail",
"advanced"
]
},
"userAgentWebAddr": {
"type": "string",
"title": "HTTP user-agent web address",
"description": "Web address to use as part of connector identification.",
"hints": [
"v1LegacyId:f.userAgentWebAddr",
"advanced"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlIdConfig"
]
},
"crawlPerformanceConfig": {
"type": "object",
"title": "Crawl Performance Properties",
"required": [],
"properties": {
"fetchDelayMSPerHost": {
"type": "boolean",
"title": "Fetch delay per host",
"description": "If true, the 'Fetch delay (ms)' property will be applied for each host.",
"default": false,
"hints": [
"v1LegacyId:fetchDelayMSPerHost",
"advanced"
]
},
"fetchThreads": {
"type": "number",
"title": "Fetch threads",
"description": "The number of threads to use during fetching. The default is 5.",
"default": 5,
"hints": [
"v1LegacyId:fetchThreads"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"emitThreads": {
"type": "number",
"title": "Emit threads",
"description": "The number of threads used to send documents from the connector to the index pipeline. The default is 5.",
"default": 5,
"hints": [
"v1LegacyId:emitThreads"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"chunkSize": {
"type": "number",
"title": "Fetch batch size",
"description": "The number of items to batch for each round of fetching. A higher value can make crawling faster, but memory usage is also increased. The default is 1.",
"default": 1,
"hints": [
"advanced",
"v1LegacyId:chunkSize"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchDelayMS": {
"type": "number",
"title": "Fetch delay",
"description": "Number of milliseconds to wait between fetch requests. The default is 0. This property can be used to throttle a crawl if necessary.",
"default": 0,
"hints": [
"advanced",
"v1LegacyId:fetchDelayMS"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"retryEmit": {
"type": "boolean",
"title": "Retry emits",
"description": "Set to true for emit batch failures to be retried on a document-by-document basis.",
"default": true,
"hints": [
"advanced",
"v1LegacyId:retryEmit"
]
},
"failFastOnStartLinkFailure": {
"type": "boolean",
"title": "Fail crawl if start links are invalid",
"description": "If true, when Fusion cannot connect to any of the provided start links, the crawl is stopped and an exception logged.",
"default": true,
"hints": [
"v1LegacyId:failFastOnStartLinkFailure",
"advanced"
]
},
"timeoutMS": {
"type": "number",
"title": "Connection timeout (ms)",
"description": "Time in milliseconds to wait for server response.",
"default": 10000,
"hints": [
"advanced",
"v1LegacyId:f.timeoutMS"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"requestRetryCount": {
"type": "number",
"title": "Request Retry Count",
"description": "If an http request fails, retry up to this many times before giving up. If set to 0, requests will not be retried. This is useful in situations where your crawls are failing with errors like \"The target server failed to respond\".",
"default": 0,
"hints": [
"v1LegacyId:f.requestRetryCount"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"obeyRobotsDelay": {
"type": "boolean",
"title": "Obey robots.txt Crawl-Delay",
"description": "If true, Crawl-Delay rules in robots.txt will be obeyed. Disabling this option will speed up crawling, but is considered negative behavior for sites you do not control.",
"default": true,
"hints": [
"v1LegacyId:f.obeyRobotsDelay",
"advanced"
]
},
"parserRetryCount": {
"type": "number",
"title": "Max Parser Retries",
"description": "The maximum number of times the configured parser will try getting content before giving up",
"default": 0,
"hints": [
"v1LegacyId:parserRetryCount"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlPerformanceConfig"
]
},
"dedupeConfig": {
"type": "object",
"title": "Dedupe Properties",
"required": [],
"properties": {
"dedupe": {
"type": "boolean",
"title": "Dedupe documents",
"description": "If true, documents will be deduplicated. Deduplication can be done based on an analysis of the content, on the content of a specific field, or by a JavaScript function. If neither a field nor a script are defined, content analysis will be used.",
"default": false,
"hints": [
"v1LegacyId:dedupe",
"advanced"
]
},
"dedupeField": {
"type": "string",
"title": "Dedupe field",
"description": "Field to be used for dedupe. Define either a field or a dedupe script, otherwise the full raw content of each document will be used.",
"hints": [
"v1LegacyId:dedupeField",
"advanced"
]
},
"dedupeScript": {
"type": "string",
"title": "Dedupe script",
"description": "Custom javascript to dedupe documents. The script must define a 'genSignature(content){}' function, but can use any combination of document fields. The function must return a string.",
"hints": [
"code",
"advanced",
"v1LegacyId:dedupeScript",
"javascript"
]
},
"dedupeSaveSignature": {
"type": "boolean",
"title": "Save dedupe signature",
"description": "If true, the signature used for dedupe will be stored in a 'dedupeSignature_s' field. Note this may cause errors about 'immense terms' in that field.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:dedupeSaveSignature"
]
},
"followCanonicalTags": {
"type": "boolean",
"title": "Deduplication via canonical tag",
"description": "Deduplicate, by only indexing the document at the URL specified in the canonical tag. https://en.wikipedia.org/wiki/Canonical_link_element",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.followCanonicalTags"
]
},
"canonicalTagsRedirectLimit": {
"type": "number",
"title": "The maximum number of requests to perform while resolving the canonical URL for a page.",
"description": "Because canonical tag resolution may be cyclical, a limit must be applied to the total number of requests. This value ensures that the resolution finishes in a reasonable amount of time.",
"default": 4,
"hints": [
"advanced",
"v1LegacyId:f.canonicalTagsRedirectLimit"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.DedupeConfig"
]
},
"documentParsingConfig": {
"type": "object",
"title": "Document Parsing Properties",
"required": [],
"properties": {
"defaultCharSet": {
"type": "string",
"title": "Default character set",
"description": "Default character set to use when one is not declared in the HTTP headers.",
"default": "UTF-8",
"hints": [
"v1LegacyId:f.defaultCharSet",
"advanced"
]
},
"obeyCharSet": {
"type": "boolean",
"title": "Obey server-supplied charset",
"description": "Use the encoding sent by the web server (if any) when parsing content. If unset, Fusion will try to guess the character set when parsing.",
"default": true,
"hints": [
"advanced",
"v1LegacyId:f.obeyCharSet"
]
},
"defaultMIMEType": {
"type": "string",
"title": "Default MIME type",
"description": "Default MIME type to use when one is not declared in the HTTP headers.",
"default": "application/octet-stream",
"hints": [
"v1LegacyId:f.defaultMIMEType",
"advanced"
]
},
"appendTrailingSlashToLinks": {
"type": "boolean",
"title": "Add trailing slash to link URLs",
"description": "If true, a trailing '/' will be added to link URLs when the URL does not end in a dot ('.').",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.appendTrailingSlashToLinks"
]
},
"scrapeLinksBeforeFiltering": {
"type": "boolean",
"title": "Scrape links before filtering",
"description": "If true, links will be extracted from documents before any other document processing has occurred. By default, links are extracted after all other document processing.",
"default": false,
"hints": [
"v1LegacyId:f.scrapeLinksBeforeFiltering",
"advanced"
]
},
"tagFields": {
"type": "array",
"title": "Tag fields",
"description": "HTML tags of elements to put into their own field in the index. The field will have the same name as the tag.",
"hints": [
"advanced",
"v1LegacyId:f.tagFields"
],
"items": {
"type": "string"
}
},
"tagIDFields": {
"type": "array",
"title": "Tag-ID fields",
"description": "HTML tag IDs of elements to put into their own field in the index. The field will have the same name as the tag ID.",
"hints": [
"advanced",
"v1LegacyId:f.tagIDFields"
],
"items": {
"type": "string"
}
},
"tagClassFields": {
"type": "array",
"title": "Tag-class fields",
"description": "HTML tag classes of elements to put into their own field in the index. The field will have the same name as the tag class.",
"hints": [
"v1LegacyId:f.tagClassFields",
"advanced"
],
"items": {
"type": "string"
}
},
"selectorFields": {
"type": "array",
"title": "Jsoup selector fields",
"description": "List of Jsoup selectors for elements to put into their own field in the index. The field will have the same name as the element. Syntax for jsoup selectors is available at http://jsoup.org/apidocs/org/jsoup/select/Selector.html.",
"hints": [
"advanced",
"v1LegacyId:f.selectorFields"
],
"items": {
"type": "string"
}
},
"filteringRootTags": {
"type": "array",
"title": "Root elements to filter",
"description": "Root HTML elements whose child elements will be used to extract content. By default 'body' and 'head' elements are already included.",
"hints": [
"v1LegacyId:f.filteringRootTags",
"advanced"
],
"items": {
"type": "string"
}
},
"includeSelectors": {
"type": "array",
"title": "Jsoup inclusive selectors",
"description": "Jsoup-formatted selectors for elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeSelectors"
],
"items": {
"type": "string"
}
},
"includeTags": {
"type": "array",
"title": "Included tags",
"description": "HTML tag names of elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeTags"
],
"items": {
"type": "string"
}
},
"includeTagClasses": {
"type": "array",
"title": "Included tag classes",
"description": "HTML tag classes of elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeTagClasses"
],
"items": {
"type": "string"
}
},
"includeTagIDs": {
"type": "array",
"title": "Included tag IDs",
"description": "HTML tag IDs of elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeTagIDs"
],
"items": {
"type": "string"
}
},
"excludeSelectors": {
"type": "array",
"title": "Jsoup exclusive selectors",
"description": "Jsoup-formatted selectors for elements to exclude from the crawled content. Syntax for jsoup selectors is available at http://jsoup.org/apidocs/org/jsoup/select/Selector.html.",
"hints": [
"advanced",
"v1LegacyId:f.excludeSelectors"
],
"items": {
"type": "string"
}
},
"excludeTags": {
"type": "array",
"title": "Excluded tags",
"description": "HTML tag names of elements to exclude from the crawled content.",
"hints": [
"v1LegacyId:f.excludeTags",
"advanced"
],
"items": {
"type": "string"
}
},
"excludeTagClasses": {
"type": "array",
"title": "Excluded tag classes",
"description": "HTML tag classes of elements to exclude from the crawled content.",
"hints": [
"v1LegacyId:f.excludeTagClasses",
"advanced"
],
"items": {
"type": "string"
}
},
"excludeTagIDs": {
"type": "array",
"title": "Excluded tag IDs",
"description": "HTML tag IDs of elements to exclude from the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.excludeTagIDs"
],
"items": {
"type": "string"
}
},
"customLinkSelectors": {
"type": "array",
"title": "Custom XPath Link Selectors",
"description": "By default, only standard anchor tags, iframe tags, frame tags, and link tags are fetched. This allows you to use one or more XPath expressions to parse links from custom places, such as //option/@value.",
"hints": [
"v1LegacyId:f.customLinkSelectors"
],
"items": {
"type": "string"
}
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.DocumentParsingConfig"
]
},
"javascriptEvaluationConfig": {
"type": "object",
"title": "Javascript Evaluation Properties",
"required": [],
"properties": {
"crawlJS": {
"type": "boolean",
"title": "Evaluate Javascript",
"description": "Evaluate JavaScript on web pages when crawling. This makes it possible for the Web fetcher to extract content from pages that is only available after JavaScript has prepared the document, but it may make the crawl slower because JavaScript loading can be time consuming.",
"default": false,
"hints": [
"v1LegacyId:f.crawlJS"
]
},
"jsEnabledAuth": {
"type": "boolean",
"title": "Evaluate JavaScript during SmartForms/SAML Login",
"description": "Evaluate JavaScript when doing SAML/SmartForm authentication. This is only applicable if you have specified a SmartForms/SAML Authentication element in the \"Crawl Authentication\" area.",
"default": false,
"hints": [
"v1LegacyId:f.jsEnabledAuth"
]
},
"jsPageLoadTimeout": {
"type": "number",
"title": "Timeout",
"description": "The time to wait in milliseconds for a page load to complete. If the timeout is -1, page loads can be indefinite. Maximum: 180,000ms i.e. 3 minutes",
"default": 20000,
"hints": [
"v1LegacyId:f.jsPageLoadTimeout"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"jsScriptTimeout": {
"type": "number",
"title": "Script Timeout",
"description": "The time to wait in milliseconds wait for an asynchronous script to finish execution. If the timeout is -1, then the script will be allowed to run indefinitely. Maximum: 30,000ms",
"default": 20000,
"hints": [
"v1LegacyId:f.jsScriptTimeout"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"jsAjaxTimeout": {
"type": "number",
"title": "AJAX Timeout",
"description": "The time in milliseconds after which an AJAX request will be ignored when considering whether all AJAX requests have completed. Maximum: 180,000ms i.e. 3 minutes",
"default": 20000,
"hints": [
"v1LegacyId:f.jsAjaxTimeout"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"extraLoadTimeMs": {
"type": "number",
"title": "Extra time to wait for content after page load (ms)",
"description": "The JavaScript evaluation process will first wait for the DOM 'document.readyState' to be set to 'complete'; then it will wait until there are no more pending Ajax before emitting the page’s contents. Use this property to wait an additional number of milliseconds before emitting the contents. This gives background JavaScript routines a chance to finish rendering the page before the contents is emitted.",
"default": 250,
"hints": [
"v1LegacyId:f.extraLoadTimeMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"extraPageLoadDeltaChars": {
"type": "number",
"title": "Stop waiting for extraLoadTimeMs if page size increases by this many bytes.",
"description": "This parameter is used when the \"Extra time to wait for content after page load (ms)\" parameter is > 0. It will stop the additional wait time if it sees the web page's content grows by at least this many characters. If set to 0 (the default) any increase in character count indicates the page load is finished.",
"default": 0,
"hints": [
"v1LegacyId:f.extraPageLoadDeltaChars"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"quitTimeoutMs": {
"type": "number",
"title": "Web Driver Quit Timeout (milliseconds)",
"description": "The amount of time to wait for a web browser to quit before killing the browser process.",
"default": 5000,
"hints": [
"advanced",
"v1LegacyId:f.quitTimeoutMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"useRequestCounter": {
"type": "boolean",
"title": "Use Request Counter",
"description": "Use the request counter plugin to wait for all pending ajax requests to be complete before loading the page contents.",
"default": true,
"hints": [
"v1LegacyId:f.useRequestCounter"
]
},
"requestCounterMinWaitMs": {
"type": "number",
"title": "Request counter min wait (ms)",
"description": "When the requestcounter is enabled, often early on the requestcount may say there are 0 pending requests... but there may still be ajax requests that haven't run yet. This parameter provides a certain time in milliseconds to wait for a non-zero count to be returned. If a requestcount is non-zero at any point, then the next requestcount = 0 is assumed to signify this page is done loading.",
"default": 750,
"hints": [
"v1LegacyId:f.requestCounterMinWaitMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"requestCounterMaxWaitMs": {
"type": "number",
"title": "Request counter max wait (ms)",
"description": "The request counter plugin counts active ajax requests after a page was loaded until there are no more pending ajax requests. This parameter says how long to wait in milliseconds for the requestcount to go to 0 before giving up.",
"default": 20000,
"hints": [
"v1LegacyId:f.requestCounterMaxWaitMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"useHighPerfJsEval": {
"type": "boolean",
"title": "High Performance Mode",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"default": false,
"hints": [
"v1LegacyId:f.useHighPerfJsEval",
"hidden"
]
},
"headlessBrowser": {
"type": "boolean",
"title": "Headless Browser",
"description": "Applicable only when \"Evaluate JavaScript\" is selected, deselect this checkbox if you want to actually see browser windows display while fetchers process web pages. Otherwise, if selected, browsers will run in \"headless\" mode which means they will run in the background. If running on a server with no desktop interface, this must stay selected.",
"default": true,
"hints": [
"v1LegacyId:f.headlessBrowser"
]
},
"takeScreenshot": {
"type": "boolean",
"title": "Index a screenshot of rendered page",
"description": "Applicable only when \"Evaluate JavaScript\" is selected, take a screenshot of the fully rendered page and index it. Screenshots will be indexed in a field called \"screenshot_bin\". You must make sure your schema specifies this field as a binary field or indexing will fail. To add this, go to System -> Solr Config -> Managed Schema then add <dynamicField indexed=\"true\" name=\"*_bin\" stored=\"true\" type=\"binary\"/>",
"default": false,
"hints": [
"v1LegacyId:f.takeScreenshot"
]
},
"screenshotFullscreen": {
"type": "boolean",
"title": "Screenshots Full Screen",
"description": "When taking a screenshot, capture the full screen.",
"default": false,
"hints": [
"v1LegacyId:f.screenshotFullscreen"
]
},
"viewportWidth": {
"type": "number",
"title": "Viewport Width",
"description": "Set an optional browser viewport width. If not specified, will default to 800.",
"hints": [
"v1LegacyId:f.viewportWidth"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"viewportHeight": {
"type": "number",
"title": "Viewport Height",
"description": "Set an optional browser viewport height. If not specified, will default to 600.",
"hints": [
"v1LegacyId:f.viewportHeight"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"deviceScreenFactor": {
"type": "number",
"title": "Device Screen Factor",
"description": "Set an optional browser device screen factor. If not specified, will default to 1 (no scaling).",
"hints": [
"v1LegacyId:f.deviceScreenFactor"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"simulateMobile": {
"type": "boolean",
"title": "Simulate mobile",
"description": "Simulate a mobile device",
"default": false,
"hints": [
"v1LegacyId:f.simulateMobile",
"advanced"
]
},
"mobileScreenWidth": {
"type": "number",
"title": "Mobile screen width (Only used for simulate mobile)",
"description": "If simulate mobile is checked, this species the device's emulated screen width.",
"hints": [
"advanced",
"v1LegacyId:f.mobileScreenWidth"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"mobileScreenHeight": {
"type": "number",
"title": "Mobile screen height (Only used for simulate mobile)",
"description": "If simulate mobile is checked, this species the device's emulated screen height.",
"hints": [
"advanced",
"v1LegacyId:f.mobileScreenHeight"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"chromeBinaryPath": {
"type": "string",
"title": "chromeBinaryPath",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.chromeBinaryPath",
"advanced"
]
},
"chromeExtraCommandLineArgs": {
"type": "string",
"title": "Google Chrome Extra Command Line Options",
"description": "Specify additional command line arguments to add to the chromium executable when it is run.",
"hints": [
"advanced",
"v1LegacyId:f.chromeExtraCommandLineArgs"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.JavascriptEvaluationConfig"
]
},
"linkDiscoveryConfig": {
"type": "object",
"title": "Link Discovery",
"required": [],
"properties": {
"restrictToTreeIgnoredHostPrefixes": {
"type": "array",
"title": "Restrict crawl host prefix exemptions",
"description": "Modifies the behavior of 'Restrict crawl to start-link tree' to ignore the configured list of prefixes when restricting the crawl. Commonly, 'www.' is ignored so links with the same domain are allowed, whether of the form 'http://host.com' or 'http://www.host.com'. This option requires 'Restrict to start-link tree' to be enabled to have any effect.",
"hints": [
"v1LegacyId:restrictToTreeIgnoredHostPrefixes"
],
"items": {
"type": "string"
}
},
"restrictToTree": {
"type": "boolean",
"title": "Restrict crawl to start-link tree",
"description": "If true, only URLs that match the startLinks URL domain will be followed",
"default": true,
"hints": [
"v1LegacyId:restrictToTree"
]
},
"restrictToTreeAllowSubdomains": {
"type": "boolean",
"title": "Ignore sub-domains when restricting crawl",
"description": "Modifies the behavior of 'Restrict crawl to start-link tree' so that a link to any sub-domain of the start links is allowed. For example, if the start link is 'http://host.com', this option ensures that links to 'http://news.host.com' are also followed. This option requires 'Restrict to start-link tree' to be enabled to have any effect.",
"default": false,
"hints": [
"v1LegacyId:restrictToTreeAllowSubdomains"
]
},
"restrictToTreeUseHostAndPath": {
"type": "boolean",
"title": "Restrict crawl to start-link path",
"description": "Modifies the behavior of 'Restrict crawl to start-link tree' to include the 'path' of the start link in the restriction logic. For example, if the start link is 'http://host.com/US', this option will limit all followed URLs to ones starting with the '/US/' path. This option requires 'Restrict to start-link tree' to be enabled to have any effect.",
"default": false,
"hints": [
"v1LegacyId:restrictToTreeUseHostAndPath"
]
},
"sitemapURLs": {
"type": "array",
"title": "Sitemap URLs",
"description": "URLs for sitemaps, to be used a basis for link discovery. Rules found in sitemaps will not be processed.",
"hints": [
"v1LegacyId:f.sitemapURLs"
],
"items": {
"type": "string"
}
},
"respectMetaEquivRedirects": {
"type": "boolean",
"title": "Respect refresh redirects",
"description": "If true, the connector will follow metatags with refresh redirects such as <meta http-equiv=\"refresh\" />.",
"default": false,
"hints": [
"v1LegacyId:f.respectMetaEquivRedirects"
]
},
"allowCircularRedirects": {
"type": "boolean",
"title": "Allow circular redirects",
"description": "If true, a request can be redirected to the same URL multiple times",
"default": false,
"hints": [
"v1LegacyId:f.allowCircularRedirects"
]
},
"addedHeaders": {
"type": "string",
"title": "Headers to add to HTTP requests",
"description": "Add these headers to http requests. This is useful for web sites that require certain headers to let you visit them. Write each header on its own line in the format HeaderName: HeaderValue",
"hints": [
"lengthy",
"v1LegacyId:f.addedHeaders"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.LinkDiscoveryConfig"
]
},
"recrawlRulesConfig": {
"type": "object",
"title": "Recrawl Rules",
"required": [],
"properties": {
"delete": {
"type": "boolean",
"title": "Delete dead URIs",
"description": "Set to true to remove documents from the index when they can no longer be accessed as unique documents.",
"default": true,
"hints": [
"v1LegacyId:delete",
"advanced"
]
},
"deleteErrorsAfter": {
"type": "number",
"title": "Fetch failure allowance",
"description": "Number of times a website can error out, for example with a 500 error or a connection timeout, before a document is removed from the index. The default of -1 means such documents are never removed. Note that pages that return a 404 status code can be configured to be removed immediately regardless of this setting.",
"default": -1,
"hints": [
"v1LegacyId:deleteErrorsAfter",
"advanced"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"refreshAll": {
"type": "boolean",
"title": "Recrawl all items",
"description": "Set to true to always recrawl all items found in the crawldb.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:refreshAll"
]
},
"refreshStartLinks": {
"type": "boolean",
"title": "Recrawl start links",
"description": "Set to true to recrawl items specified in the list of start links.",
"default": false,
"hints": [
"v1LegacyId:refreshStartLinks",
"advanced"
]
},
"refreshErrors": {
"type": "boolean",
"title": "Recrawl errors",
"description": "Set to true to recrawl items that failed during the last crawl.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:refreshErrors"
]
},
"refreshOlderThan": {
"type": "number",
"title": "Recrawl age",
"description": "Number of seconds to recrawl items whose last fetched date is longer ago than this value.",
"default": -1,
"hints": [
"advanced",
"v1LegacyId:refreshOlderThan"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"refreshIDPrefixes": {
"type": "array",
"title": "Recrawl ID prefixes",
"description": "A prefix to recrawl all items whose IDs begin with this value.",
"hints": [
"advanced",
"v1LegacyId:refreshIDPrefixes"
],
"items": {
"type": "string"
}
},
"refreshIDRegexes": {
"type": "array",
"title": "Recrawl ID regexes",
"description": "A regular expression to recrawl all items whose IDs match this pattern.",
"hints": [
"v1LegacyId:refreshIDRegexes",
"advanced"
],
"items": {
"type": "string"
}
},
"refreshScript": {
"type": "string",
"title": "Recrawl script",
"description": "A JavaScript function ('shouldRefresh()') to customize the items recrawled. ",
"hints": [
"code",
"advanced",
"v1LegacyId:refreshScript",
"javascript"
]
},
"forceRefresh": {
"type": "boolean",
"title": "Force recrawl",
"description": "Set to true to recrawl all items even if they have not changed since the last crawl.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:forceRefresh"
]
},
"forceRefreshClearSignatures": {
"type": "boolean",
"title": "Clear signatures",
"description": "If true, signatures will be cleared if force recrawl is enabled.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:forceRefreshClearSignatures"
]
},
"delete404": {
"type": "boolean",
"title": "Remove 404/410 pages",
"description": "Select this option to delete indexed pages that return a 404 or 410 error.",
"default": true,
"hints": [
"advanced",
"v1LegacyId:delete404"
]
},
"sitemapIncrementalCrawling": {
"type": "boolean",
"title": "Process Sitemap URLs",
"description": "When enabled, only URLs found in the sitemap will be processed and crawled.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:sitemap_incremental_crawling"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.RecrawlRulesConfig"
]
},
"cookieSpec": {
"type": "string",
"title": "Cookie spec",
"default": "browser-compatibility",
"hints": [
"v1LegacyId:f.cookieSpec"
]
},
"rewriteLinkScript": {
"type": "string",
"title": "URI rewrite script",
"description": "A Javascript function 'rewriteLink(link) { }' to modify links to documents before they are fetched.",
"hints": [
"code",
"advanced",
"v1LegacyId:rewriteLinkScript",
"javascript"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.WebConfig$Properties",
"com.lucidworks.fusion.connector.plugin.api.config.ConnectorPluginProperties"
]
},
"id": {
"type": "string",
"title": "Configuration ID",
"description": "A unique identifier for this Configuration.",
"minLength": 1,
"pattern": "^[a-zA-Z0-9_-]+$"
},
"pipelineId": {
"type": "string",
"title": "Pipeline ID",
"description": "Name of the IndexPipeline used for processing output.",
"default": "lucidworks-web",
"minLength": 1,
"pattern": "^[a-zA-Z0-9_-]+$"
},
"parserId": {
"type": "string",
"title": "Parser ID",
"description": "The Parser to use in the associated IndexPipeline.",
"default": "lucidworks-web",
"pattern": "^[a-zA-Z0-9_-]+$"
},
"description": {
"type": "string",
"title": "Description",
"description": "Optional description",
"hints": [
"lengthy"
],
"maxLength": 125
},
"type": {
"type": "string",
"title": "Type",
"description": "A type ID for this connector.",
"hints": [
"readonly",
"hidden"
]
},
"created": {
"type": "string",
"title": "Date Created",
"description": "The date at which this Configuration was created.",
"hints": [
"readonly",
"hidden"
]
},
"modified": {
"type": "string",
"title": "Date Modified",
"description": "The date at which this Configuration was last modified.",
"hints": [
"readonly",
"hidden"
]
},
"diagnosticLogging": {
"type": "boolean",
"title": "Diagnostic Logging",
"description": "Enable diagnostic logging; disabled by default",
"default": false
},
"collectionId": {
"type": "string",
"title": "Collection ID",
"description": "The associated content Collection.",
"hints": [
"readonly",
"hidden"
],
"minLength": 1,
"pattern": "^[a-zA-Z0-9_-]+$"
},
"coreProperties": {
"type": "object",
"title": "Core Properties",
"description": "Common behavior and performance settings.",
"required": [],
"properties": {
"fetchSettings": {
"type": "object",
"title": "Fetch Settings",
"description": "System level settings for controlling fetch behavior and performance.",
"required": [],
"properties": {
"indexingThreads": {
"type": "number",
"title": "Index Subscription Threads",
"description": "Maximum number of indexing threads; defaults to 4.This setting controls the number of threads in the indexing service used for processing content documents emitted by this datasource.Higher values can sometimes help with overall fetch performance.",
"default": 4,
"maximum": 10,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"pluginInstances": {
"type": "number",
"title": "Number of plugin instances for distributed fetching",
"description": "Maximum number of plugin instances for distributed fetching. Only specified number of plugin instanceswill do fetching. This is useful for distributing load between different instances.",
"default": 0,
"maximum": 500,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchItemQueueSize": {
"type": "number",
"title": "Fetch Item Queue Size",
"description": "Size of the fetch item queue.Larger values result in increased memory usage, but potentially higher performance.Default is 10k.",
"default": 10000,
"hints": [
"hidden"
],
"maximum": 500000,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchRequestCheckInterval": {
"type": "number",
"title": "Fetch request check interval(ms)",
"description": "The amount of time to wait before check if a request is done",
"default": 15000,
"hints": [
"hidden"
],
"maximum": 500000,
"exclusiveMaximum": false,
"minimum": 1000,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchResponseScheduledTimeout": {
"type": "number",
"title": "Fetch response scheduled timeout(ms)",
"description": "The maximum amount of time for a response to be scheduled. The task will be canceled if this setting is exceeded.",
"default": 300000,
"maximum": 500000,
"exclusiveMaximum": false,
"minimum": 1000,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchResponseCompletedTimeout": {
"type": "number",
"title": "Fetch response completion timeout(ms)",
"description": "The maximum amount of time for a response to be completed. If exceeded, the task will be retried if the job is still running",
"default": 300000,
"hints": [
"hidden"
],
"maximum": 600000,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"indexingInactivityTimeout": {
"type": "number",
"title": "Indexing inactivity timeout(seconds)",
"description": "The maximum amount of time to wait for indexing results (in seconds). If exceeded, the job will fail with an indexing inactivity timeout.",
"default": 86400,
"maximum": 691200,
"exclusiveMaximum": false,
"minimum": 60,
"exclusiveMinimum": false,
"multipleOf": 1
},
"pluginInactivityTimeout": {
"type": "number",
"title": "Plugin inactivity timeout(seconds)",
"description": "The maximum amount of time to wait for plugin activity (in seconds). If exceeded, the job will fail with a plugin inactivity timeout.",
"default": 600,
"maximum": 691200,
"exclusiveMaximum": false,
"minimum": 60,
"exclusiveMinimum": false,
"multipleOf": 1
},
"indexMetadata": {
"type": "boolean",
"title": "Index metadata",
"description": "When enabled the metadata of skipped items will be indexed to the content collection.",
"default": false
},
"indexContentFields": {
"type": "boolean",
"title": "Index content fields",
"description": "When enabled, content fields will be indexed to the crawl-db collection.",
"default": false
},
"asyncParsing": {
"type": "boolean",
"title": "Async Parsing",
"description": "When enabled, content will be indexed asynchronously.",
"default": false
},
"numFetchThreads": {
"type": "number",
"title": "Fetch Threads",
"description": "Maximum number of fetch threads; defaults to 5.This setting controls the number of threads that call the Connectors fetch method.Higher values can, but not always, help with overall fetch performance.",
"default": 5,
"maximum": 500,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
}
},
"interfaces": [
"com.lucidworks.fusion.connector.plugin.api.config.CoreConfig$FetchSystemSettings"
]
}
},
"interfaces": [
"com.lucidworks.fusion.connector.plugin.api.config.CoreConfig"
],
"hints": [
"advanced"
]
}
},
"category": "Web",
"interfaces": [
"com.lucidworks.connector.plugins.web.config.WebConfig",
"com.lucidworks.fusion.connector.plugin.api.config.ConnectorConfig"
]
}
}
Return the plugin schema definition. V2 plugins only.
curl --request GET \
--url https://{FUSION HOST}/api/connectors/schema/{type}
{
"id": "PpwsTj7f39",
"name": "lucidworks.web-v2",
"pluginVersion": "1.3.0",
"sdkVersion": "4.1.3",
"schema": {
"type": "object",
"title": "Web (v2)",
"description": "Connector for websites and web-based content resources.",
"required": [
"id",
"properties",
"pipelineId"
],
"properties": {
"properties": {
"type": "object",
"title": "Properties",
"description": "Plugin specific properties.",
"required": [],
"properties": {
"startLinks": {
"type": "array",
"title": "Start Links",
"description": "The URL(s) that the crawler will start crawling from, for example: https://en.wikipedia.org/wiki/Main_Page",
"hints": [
"v1LegacyId:startLinks"
],
"items": {
"type": "string"
}
},
"bulkStartLinks": {
"type": "string",
"title": "Bulk start links",
"description": "If a large number of start links must be defined, you can provide them here. One link per line.",
"hints": [
"lengthy"
]
},
"limitDocumentsConfig": {
"type": "object",
"title": "Limit Documents Properties",
"required": [],
"properties": {
"depth": {
"type": "number",
"title": "Max crawling depth",
"description": "Number of levels in a directory or site tree to descend for documents.",
"default": -1,
"hints": [
"v1LegacyId:depth"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"maxItems": {
"type": "number",
"title": "Max items",
"description": "Maximum number of documents to fetch. The default (-1) means no limit.",
"default": -1,
"hints": [
"v1LegacyId:maxItems"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"excludeExtensions": {
"type": "array",
"title": "Excluded file extensions",
"description": "File extensions that should not to be fetched. This will limit this datasource to all extensions except this list.",
"hints": [
"v1LegacyId:excludeExtensions"
],
"items": {
"type": "string"
}
},
"excludeRegexes": {
"type": "array",
"title": "Exclusive regexes",
"description": "Regular expressions for URI patterns to exclude. This will limit this datasource to only URIs that do not match the regular expression.",
"hints": [
"v1LegacyId:excludeRegexes"
],
"items": {
"type": "string"
}
},
"includeExtensions": {
"type": "array",
"title": "Included file extensions",
"description": "File extensions to be fetched. This will limit this datasource to only these file extensions.",
"hints": [
"v1LegacyId:includeExtensions"
],
"items": {
"type": "string"
}
},
"includeRegexes": {
"type": "array",
"title": "Inclusive regexes",
"description": "Regular expressions for URI patterns to include. This will limit this datasource to only URIs that match the regular expression.",
"hints": [
"v1LegacyId:includeRegexes"
],
"items": {
"type": "string"
}
},
"maxSizeBytes": {
"type": "number",
"title": "Max file size (bytes)",
"description": "Maximum size, in bytes, of a document to fetch.",
"default": 20000000,
"hints": [
"v1LegacyId:f.maxSizeBytes"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"indexItemsDiscarded": {
"type": "boolean",
"title": "Index discarded document metadata",
"description": "Enable to index discarded document metadata",
"default": false,
"hints": [
"v1LegacyId:f.index_items_discarded"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.LimitDocumentsConfig"
]
},
"crawlAuthenticationConfig": {
"type": "object",
"title": "Crawl Authentication Properties",
"required": [],
"properties": {
"maintainCookies": {
"type": "boolean",
"title": "Maintain Cookies for No-Auth Crawls",
"description": "If you are not using authentication, then by default cookies are not stored in between web requests (stateless). If checked, cookies will be maintained between requests during the web crawl even when you are not using authentication. If you are using authentication, this checkbox has no effect on the crawl and can be ignored.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.maintainCookies"
]
},
"basicAuth": {
"type": "array",
"title": "Basic Authentication",
"description": "Settings for Basic authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"host": {
"type": "string",
"title": "Host",
"description": "The host of the site. You can specify * to authenticate against any host. (Warning authentication cookies will be send to any host)",
"hints": [
"v1LegacyId:host"
]
},
"port": {
"type": "number",
"title": "Port",
"description": "The port. You can specify -1 to authenticate against any port",
"hints": [
"v1LegacyId:port"
],
"maximum": 65535,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"realm": {
"type": "string",
"title": "Realm",
"description": "The realm for the site, if applicable",
"hints": [
"v1LegacyId:realm"
]
},
"username": {
"type": "string",
"title": "Username",
"description": "The username to use for authentication.",
"hints": [
"v1LegacyId:userName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for authentication.",
"hints": [
"v1LegacyId:password",
"secret"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.BasicAuthConfig"
]
}
},
"digestAuth": {
"type": "array",
"title": "Digest Authentication",
"description": "Settings for Digest authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"host": {
"type": "string",
"title": "Host",
"description": "The host of the site. You can specify * to authenticate against any host. (Warning authentication cookies will be send to any host)",
"hints": [
"v1LegacyId:host"
]
},
"port": {
"type": "number",
"title": "Port",
"description": "The port. You can specify -1 to authenticate against any port",
"hints": [
"v1LegacyId:port"
],
"maximum": 65535,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"realm": {
"type": "string",
"title": "Realm",
"description": "The realm for the site, if applicable",
"hints": [
"v1LegacyId:realm"
]
},
"username": {
"type": "string",
"title": "Username",
"description": "The username to use for authentication.",
"hints": [
"v1LegacyId:userName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for authentication.",
"hints": [
"v1LegacyId:password",
"secret"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.DigestAuthConfig"
]
}
},
"ntlmAuth": {
"type": "array",
"title": "NTLM Authentication",
"description": "Settings for NTLM authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"domain": {
"type": "string",
"title": "Domain",
"description": "The NTLM Domain",
"hints": [
"v1LegacyId:domain"
]
},
"workstation": {
"type": "string",
"title": "Workstation",
"description": "The NTLM Workstation name",
"hints": [
"v1LegacyId:workstation"
]
},
"host": {
"type": "string",
"title": "Host",
"description": "The host of the site. You can specify * to authenticate against any host. (Warning authentication cookies will be send to any host)",
"hints": [
"v1LegacyId:host"
]
},
"port": {
"type": "number",
"title": "Port",
"description": "The port. You can specify -1 to authenticate against any port",
"hints": [
"v1LegacyId:port"
],
"maximum": 65535,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"realm": {
"type": "string",
"title": "Realm",
"description": "The realm for the site, if applicable",
"hints": [
"v1LegacyId:realm"
]
},
"username": {
"type": "string",
"title": "Username",
"description": "The username to use for authentication.",
"hints": [
"v1LegacyId:userName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for authentication.",
"hints": [
"v1LegacyId:password",
"secret"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.NtlmAuthConfig",
"com.lucidworks.connector.plugins.web.config.BasicAuthConfig"
]
}
},
"formAuth": {
"type": "array",
"title": "Form Authentication",
"description": "Settings for Form based authentication",
"items": {
"type": "object",
"required": [],
"properties": {
"action": {
"type": "string",
"title": "URL",
"description": "The URL of the authentication endpoint",
"hints": [
"v1LegacyId:action"
]
},
"ttl": {
"type": "number",
"title": "TTL (ms)",
"description": "The \"time to live\" in milliseconds for the session that will be created after authentication.",
"hints": [
"v1LegacyId:ttl"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"passwordParamName": {
"type": "string",
"title": "Password Parameter",
"description": "Name of the parameter containing the password",
"hints": [
"v1LegacyId:passwordParamName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for the authentication request. This will be copied into the \"Parameters\" using the \"Password Parameter\" name as the key",
"hints": [
"v1LegacyId:password",
"secret"
]
},
"paramsList": {
"type": "array",
"title": "Parameters",
"description": "Parameters sent to the authentication endpoint",
"items": {
"type": "object",
"required": [],
"properties": {
"key": {
"type": "string",
"title": "Parameters Key",
"hints": [
"v1LegacyId:paramsKey"
]
},
"value": {
"type": "string",
"title": "Parameters value",
"hints": [
"v1LegacyId:paramsValue"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.FormAuthConfig$Params"
]
}
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.FormAuthConfig"
]
}
},
"samlAuth": {
"type": "array",
"title": "SAML/Smart Form Authentication",
"description": "Settings for SAML/Smart Form based authentication allows you to visit one or more web pages that contain form inputs such as username, password, security questions, etc., submitting each one in turn in order to become authenticated.",
"items": {
"type": "object",
"required": [],
"properties": {
"action": {
"type": "string",
"title": "URL",
"description": "The URL of the authentication endpoint",
"hints": [
"v1LegacyId:action"
]
},
"ttl": {
"type": "number",
"title": "TTL (ms)",
"description": "The \"time to live\" in milliseconds for the session that will be created after authentication.",
"hints": [
"v1LegacyId:ttl"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"passwordParamName": {
"type": "string",
"title": "Password Parameter",
"description": "Name of the parameter containing the password",
"hints": [
"v1LegacyId:passwordParamName"
]
},
"password": {
"type": "string",
"title": "Password",
"description": "The password to use for the authentication request. This will be copied into the \"Parameters\" using the \"Password Parameter\" name as the key",
"hints": [
"v1LegacyId:password",
"secret"
]
},
"paramsList": {
"type": "array",
"title": "Parameters",
"description": "Parameters sent to the authentication endpoint",
"items": {
"type": "object",
"required": [],
"properties": {
"key": {
"type": "string",
"title": "Parameters Key",
"hints": [
"v1LegacyId:paramsKey"
]
},
"value": {
"type": "string",
"title": "Parameters value",
"hints": [
"v1LegacyId:paramsValue"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.FormAuthConfig$Params"
]
}
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.SamlAuthConfig",
"com.lucidworks.connector.plugins.web.config.FormAuthConfig"
]
}
},
"credentialsFile": {
"type": "string",
"title": "credentialsFile",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.credentialsFile"
]
},
"kerberosEnabled": {
"type": "boolean",
"title": "kerberosEnabled",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"default": false,
"hints": [
"v1LegacyId:f.kerberosEnabled"
]
},
"kerberosLoginContextName": {
"type": "string",
"title": "kerberosLoginContextName",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.kerberosLoginContextName"
]
},
"kerberosSpn": {
"type": "string",
"title": "kerberosSpn",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.kerberosSpn"
]
},
"kerberosPrincipal": {
"type": "string",
"title": "kerberosPrincipal",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.kerberosPrincipal"
]
},
"kerberosKeytabFile": {
"type": "string",
"title": "kerberosKeytabFile",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.kerberosKeytabFile"
]
},
"kerberosKeytabBase64": {
"type": "string",
"title": "kerberosKeytabBase64",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.kerberosKeytabBase64"
]
},
"kerberosPassword": {
"type": "string",
"title": "kerberosPassword",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.kerberosPassword",
"secret"
]
},
"obeyRobots": {
"type": "boolean",
"title": "Obey robots.txt",
"description": "If true, Allow, Disallow and other rules found in a robots.txt file will be obeyed.",
"default": false,
"hints": [
"v1LegacyId:f.obeyRobots"
]
},
"obeyRobotsMeta": {
"type": "boolean",
"title": "Obey robots meta tags and headers",
"description": "If true, rules like 'noindex', 'nofollow' and others found in a robots meta tag on a page or in the headers of the HTTP response are obeyed.",
"default": false,
"hints": [
"v1LegacyId:f.obeyRobotsMeta"
]
},
"obeyLinkNofollow": {
"type": "boolean",
"title": "Obey link nofollow attributes",
"description": "If true, rel='nofollow' on links are obeyed",
"default": false,
"hints": [
"v1LegacyId:f.obeyLinkNofollow"
]
},
"proxy": {
"type": "string",
"title": "HTTP proxy address",
"description": "Address of the HTTP proxy, if required. This should be entered in the format host:port.",
"hints": [
"v1LegacyId:f.proxy"
]
},
"allowAllCertificates": {
"type": "boolean",
"title": "Allow all HTTPS certificates",
"description": "If false, security checks will be performed on all SSL/TLS certificate signers and origins. This means self-signed certificates would not be supported.",
"default": false,
"hints": [
"v1LegacyId:f.allowAllCertificates"
]
},
"useIpAddressForSslConnections": {
"type": "boolean",
"title": "Use IP address for SSL connections",
"description": "Use IP address instead of host name for SSL connections. This is used to work around mis-configured HTTP server throwing 'unrecognized name' error whenSNI is enabled. (This only works if 'Allow all certificates' setting is also enabled)",
"default": false,
"hints": [
"v1LegacyId:f.useIpAddressForSslConnections"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlAuthenticationConfig"
]
},
"crawlHistoryConfig": {
"type": "object",
"title": "Crawl History Properties",
"required": [],
"properties": {
"crawlDBType": {
"type": "string",
"title": "Crawl database type",
"description": "The type of crawl database to use, in-memory or on-disk.",
"default": "on-disk",
"hints": [
"advanced",
"v1LegacyId:crawlDBType"
]
},
"commitAfterItems": {
"type": "number",
"title": "Commit After This Many Items",
"description": "Commit the crawlDB to disk after this many items have been received. A smaller number here will result in a slower crawl because of commits to disk being more frequent; conversely, a larger number here will cause a resumed job after a crash to need to recrawl more records.",
"default": 10000,
"hints": [
"advanced",
"v1LegacyId:commitAfterItems"
],
"maximum": 9999999,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"retainOutlinks": {
"type": "boolean",
"title": "Retain links in the crawldb",
"description": "Set to true for links found during fetching to be stored in the crawldb. This increases precision in certain recrawl scenarios, but requires more memory and disk space.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:retainOutlinks"
]
},
"aliasExpiration": {
"type": "number",
"title": "Alias expiration",
"description": "The number of crawls after which an alias will expire. The default is 1 crawl.",
"default": 1,
"hints": [
"advanced",
"v1LegacyId:aliasExpiration"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"discardLinkURLQueries": {
"type": "boolean",
"title": "Discard queries in link URLs",
"description": "If true, query parameters found in URLs will be removed before being added to the discovery queue.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.discardLinkURLQueries"
]
},
"discardLinkURLAnchors": {
"type": "boolean",
"title": "Discard anchors in link URLs",
"description": "If true, anchors found in URLs will be removed before being added to the discovery queue.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.discardLinkURLAnchors"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlHistoryConfig"
]
},
"crawlIdConfig": {
"type": "object",
"title": "Crawl Id Properties",
"required": [],
"properties": {
"userAgentName": {
"type": "string",
"title": "HTTP user-agent name",
"description": "Name the connector should use when identifying itself to a website in order to crawl it.",
"default": "Lucidworks-Anda/2.0",
"hints": [
"advanced",
"v1LegacyId:f.userAgentName"
]
},
"userAgentEmail": {
"type": "string",
"title": "HTTP user-agent email address",
"description": "Email address to use as part of connector identification.",
"hints": [
"v1LegacyId:f.userAgentEmail",
"advanced"
]
},
"userAgentWebAddr": {
"type": "string",
"title": "HTTP user-agent web address",
"description": "Web address to use as part of connector identification.",
"hints": [
"v1LegacyId:f.userAgentWebAddr",
"advanced"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlIdConfig"
]
},
"crawlPerformanceConfig": {
"type": "object",
"title": "Crawl Performance Properties",
"required": [],
"properties": {
"fetchDelayMSPerHost": {
"type": "boolean",
"title": "Fetch delay per host",
"description": "If true, the 'Fetch delay (ms)' property will be applied for each host.",
"default": false,
"hints": [
"v1LegacyId:fetchDelayMSPerHost",
"advanced"
]
},
"fetchThreads": {
"type": "number",
"title": "Fetch threads",
"description": "The number of threads to use during fetching. The default is 5.",
"default": 5,
"hints": [
"v1LegacyId:fetchThreads"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"emitThreads": {
"type": "number",
"title": "Emit threads",
"description": "The number of threads used to send documents from the connector to the index pipeline. The default is 5.",
"default": 5,
"hints": [
"v1LegacyId:emitThreads"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"chunkSize": {
"type": "number",
"title": "Fetch batch size",
"description": "The number of items to batch for each round of fetching. A higher value can make crawling faster, but memory usage is also increased. The default is 1.",
"default": 1,
"hints": [
"advanced",
"v1LegacyId:chunkSize"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchDelayMS": {
"type": "number",
"title": "Fetch delay",
"description": "Number of milliseconds to wait between fetch requests. The default is 0. This property can be used to throttle a crawl if necessary.",
"default": 0,
"hints": [
"advanced",
"v1LegacyId:fetchDelayMS"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"retryEmit": {
"type": "boolean",
"title": "Retry emits",
"description": "Set to true for emit batch failures to be retried on a document-by-document basis.",
"default": true,
"hints": [
"advanced",
"v1LegacyId:retryEmit"
]
},
"failFastOnStartLinkFailure": {
"type": "boolean",
"title": "Fail crawl if start links are invalid",
"description": "If true, when Fusion cannot connect to any of the provided start links, the crawl is stopped and an exception logged.",
"default": true,
"hints": [
"v1LegacyId:failFastOnStartLinkFailure",
"advanced"
]
},
"timeoutMS": {
"type": "number",
"title": "Connection timeout (ms)",
"description": "Time in milliseconds to wait for server response.",
"default": 10000,
"hints": [
"advanced",
"v1LegacyId:f.timeoutMS"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"requestRetryCount": {
"type": "number",
"title": "Request Retry Count",
"description": "If an http request fails, retry up to this many times before giving up. If set to 0, requests will not be retried. This is useful in situations where your crawls are failing with errors like \"The target server failed to respond\".",
"default": 0,
"hints": [
"v1LegacyId:f.requestRetryCount"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"obeyRobotsDelay": {
"type": "boolean",
"title": "Obey robots.txt Crawl-Delay",
"description": "If true, Crawl-Delay rules in robots.txt will be obeyed. Disabling this option will speed up crawling, but is considered negative behavior for sites you do not control.",
"default": true,
"hints": [
"v1LegacyId:f.obeyRobotsDelay",
"advanced"
]
},
"parserRetryCount": {
"type": "number",
"title": "Max Parser Retries",
"description": "The maximum number of times the configured parser will try getting content before giving up",
"default": 0,
"hints": [
"v1LegacyId:parserRetryCount"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.CrawlPerformanceConfig"
]
},
"dedupeConfig": {
"type": "object",
"title": "Dedupe Properties",
"required": [],
"properties": {
"dedupe": {
"type": "boolean",
"title": "Dedupe documents",
"description": "If true, documents will be deduplicated. Deduplication can be done based on an analysis of the content, on the content of a specific field, or by a JavaScript function. If neither a field nor a script are defined, content analysis will be used.",
"default": false,
"hints": [
"v1LegacyId:dedupe",
"advanced"
]
},
"dedupeField": {
"type": "string",
"title": "Dedupe field",
"description": "Field to be used for dedupe. Define either a field or a dedupe script, otherwise the full raw content of each document will be used.",
"hints": [
"v1LegacyId:dedupeField",
"advanced"
]
},
"dedupeScript": {
"type": "string",
"title": "Dedupe script",
"description": "Custom javascript to dedupe documents. The script must define a 'genSignature(content){}' function, but can use any combination of document fields. The function must return a string.",
"hints": [
"code",
"advanced",
"v1LegacyId:dedupeScript",
"javascript"
]
},
"dedupeSaveSignature": {
"type": "boolean",
"title": "Save dedupe signature",
"description": "If true,the signature used for dedupe will be stored in a 'dedupeSignature_s' field. Note this may cause errors about 'immense terms' in that field.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:dedupeSaveSignature"
]
},
"followCanonicalTags": {
"type": "boolean",
"title": "Deduplication via canonical tag",
"description": "Deduplicate, by only indexing the document at the URL specified in the canonical tag. https://en.wikipedia.org/wiki/Canonical_link_element",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.followCanonicalTags"
]
},
"canonicalTagsRedirectLimit": {
"type": "number",
"title": "The maximum number of requests to perform while resolving the canonical URL for a page.",
"description": "Because canonical tag resolution may be cyclical, a limit must be applied to the total number of requests. This value ensures that the resolution finishes in a reasonable amount of time.",
"default": 4,
"hints": [
"advanced",
"v1LegacyId:f.canonicalTagsRedirectLimit"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.DedupeConfig"
]
},
"documentParsingConfig": {
"type": "object",
"title": "Document Parsing Properties",
"required": [],
"properties": {
"defaultCharSet": {
"type": "string",
"title": "Default character set",
"description": "Default character set to use when one is not declared in the HTTP headers.",
"default": "UTF-8",
"hints": [
"v1LegacyId:f.defaultCharSet",
"advanced"
]
},
"obeyCharSet": {
"type": "boolean",
"title": "Obey server-supplied charset",
"description": "Use the encoding sent by the web server (if any) when parsing content. If unset, Fusion will try to guess the character set when parsing.",
"default": true,
"hints": [
"advanced",
"v1LegacyId:f.obeyCharSet"
]
},
"defaultMIMEType": {
"type": "string",
"title": "Default MIME type",
"description": "Default MIME type to use when one is not declared in the HTTP headers.",
"default": "application/octet-stream",
"hints": [
"v1LegacyId:f.defaultMIMEType",
"advanced"
]
},
"appendTrailingSlashToLinks": {
"type": "boolean",
"title": "Add trailing slash to link URLs",
"description": "If true, a trailing '/' will be added to link URLs when the URL does not end in a dot ('.').",
"default": false,
"hints": [
"advanced",
"v1LegacyId:f.appendTrailingSlashToLinks"
]
},
"scrapeLinksBeforeFiltering": {
"type": "boolean",
"title": "Scrape links before filtering",
"description": "If true, links will be extracted from documents before any other document processing has ocurred. By default, links are extracted after all other document processing.",
"default": false,
"hints": [
"v1LegacyId:f.scrapeLinksBeforeFiltering",
"advanced"
]
},
"tagFields": {
"type": "array",
"title": "Tag fields",
"description": "HTML tags of elements to put into their own field in the index. The field will have the same name as the tag.",
"hints": [
"advanced",
"v1LegacyId:f.tagFields"
],
"items": {
"type": "string"
}
},
"tagIDFields": {
"type": "array",
"title": "Tag-ID fields",
"description": "HTML tag IDs of elements to put into their own field in the index. The field will have the same name as the tag ID.",
"hints": [
"advanced",
"v1LegacyId:f.tagIDFields"
],
"items": {
"type": "string"
}
},
"tagClassFields": {
"type": "array",
"title": "Tag-class fields",
"description": "HTML tag classes of elements to put into their own field in the index. The field will have the same name as the tag class.",
"hints": [
"v1LegacyId:f.tagClassFields",
"advanced"
],
"items": {
"type": "string"
}
},
"selectorFields": {
"type": "array",
"title": "Jsoup selector fields",
"description": "List of Jsoup selectors for elements to put into their separate field in the index. The field will have the same name as the element. Syntax for jsoup selectors is available at http://jsoup.org/apidocs/org/jsoup/select/Selector.html.",
"hints": [
"advanced",
"v1LegacyId:f.selectorFields"
],
"items": {
"type": "string"
}
},
"filteringRootTags": {
"type": "array",
"title": "Root elements to filter",
"description": "Root HTML elements whose child elements will be used to extract content. By default 'body' and 'head' elements are already included.",
"hints": [
"v1LegacyId:f.filteringRootTags",
"advanced"
],
"items": {
"type": "string"
}
},
"includeSelectors": {
"type": "array",
"title": "Jsoup inclusive selectors",
"description": "Jsoup-formatted selectors for elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeSelectors"
],
"items": {
"type": "string"
}
},
"includeTags": {
"type": "array",
"title": "Included tags",
"description": "HTML tag names of elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeTags"
],
"items": {
"type": "string"
}
},
"includeTagClasses": {
"type": "array",
"title": "Included tag classes",
"description": "HTML tag classes of elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeTagClasses"
],
"items": {
"type": "string"
}
},
"includeTagIDs": {
"type": "array",
"title": "Included tag IDs",
"description": "HTML tag IDs of elements to include in the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.includeTagIDs"
],
"items": {
"type": "string"
}
},
"excludeSelectors": {
"type": "array",
"title": "Jsoup exclusive selectors",
"description": "Jsoup-formatted selectors for elements to exclude from the crawled content. Syntax for jsoup selectors is available at http://jsoup.org/apidocs/org/jsoup/select/Selector.html.",
"hints": [
"advanced",
"v1LegacyId:f.excludeSelectors"
],
"items": {
"type": "string"
}
},
"excludeTags": {
"type": "array",
"title": "Excluded tags",
"description": "HTML tag names of elements to exclude from the crawled content.",
"hints": [
"v1LegacyId:f.excludeTags",
"advanced"
],
"items": {
"type": "string"
}
},
"excludeTagClasses": {
"type": "array",
"title": "Excluded tag classes",
"description": "HTML tag classes of elements to exclude from the crawled content.",
"hints": [
"v1LegacyId:f.excludeTagClasses",
"advanced"
],
"items": {
"type": "string"
}
},
"excludeTagIDs": {
"type": "array",
"title": "Excluded tag IDs",
"description": "HTML tag IDs of elements to exclude from the crawled content.",
"hints": [
"advanced",
"v1LegacyId:f.excludeTagIDs"
],
"items": {
"type": "string"
}
},
"customLinkSelectors": {
"type": "array",
"title": "Custom XPath Link Selectors",
"description": "By default, only standard anchor tags, iframe tags, frame tags, and link tags are fetched. This allows you to use one or more XPath expressions to parse links from custom places. Such as //option/@value",
"hints": [
"v1LegacyId:f.customLinkSelectors"
],
"items": {
"type": "string"
}
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.DocumentParsingConfig"
]
},
"javascriptEvaluationConfig": {
"type": "object",
"title": "Javascript Evaluation Properties",
"required": [],
"properties": {
"crawlJS": {
"type": "boolean",
"title": "Evaluate Javascript",
"description": "Evaluate JavaScript on web pages when crawling. This makes it possible for the Web fetcher to extract content from pages that is only available after JavaScript has prepared the document, but it may make the crawl slower because JavaScript loading can be time consuming.",
"default": false,
"hints": [
"v1LegacyId:f.crawlJS"
]
},
"jsEnabledAuth": {
"type": "boolean",
"title": "Evaluate JavaScript during SmartForms/SAML Login",
"description": "Evaluate JavaScript when doing SAML/SmartForm authentication. This is only applicable if you have specified a SmartForms/SAML Authentication element in the \"Crawl Authentication\" area.",
"default": false,
"hints": [
"v1LegacyId:f.jsEnabledAuth"
]
},
"jsPageLoadTimeout": {
"type": "number",
"title": "Timeout",
"description": "The time to wait in milliseconds for a page load to complete. If the timeout is -1, page loads can be indefinite. Maximum: 180,000ms i.e. 3 minutes",
"default": 20000,
"hints": [
"v1LegacyId:f.jsPageLoadTimeout"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"jsScriptTimeout": {
"type": "number",
"title": "Script Timeout",
"description": "The time to wait in milliseconds wait for an asynchronous script to finish execution. If the timeout is -1, then the script will be allowed to run indefinitely. Maximum: 30,000ms",
"default": 20000,
"hints": [
"v1LegacyId:f.jsScriptTimeout"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"jsAjaxTimeout": {
"type": "number",
"title": "AJAX Timeout",
"description": "The time in milliseconds after which an AJAX request will be ignored when considering whether all AJAX requests have completed. Maximum: 180,000ms i.e. 3 minutes",
"default": 20000,
"hints": [
"v1LegacyId:f.jsAjaxTimeout"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"extraLoadTimeMs": {
"type": "number",
"title": "Extra time to wait for content after page load (ms)",
"description": "The JavaScript evaluation process will first wait for the DOM 'document.readyState' to be set to 'complete'; then it will wait until there are no more pending Ajax before emitting the page’s contents. Use this property to wait an additional number of milliseconds before emitting the contents. This gives background JavaScript routines a chance to finish rendering the page before the contents is emitted.",
"default": 250,
"hints": [
"v1LegacyId:f.extraLoadTimeMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"extraPageLoadDeltaChars": {
"type": "number",
"title": "Stop waiting for extraLoadTimeMs if page size increases by this many bytes.",
"description": "This parameter is used when the \"Extra time to wait for content after page load (ms)\" parameter is > 0. It will stop the additional wait time if it sees the web page's content grows by at least this many characters. If set to 0 (the default) any increase in character count indicates the page load is finished.",
"default": 0,
"hints": [
"v1LegacyId:f.extraPageLoadDeltaChars"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"quitTimeoutMs": {
"type": "number",
"title": "Web Driver Quit Timeout (milliseconds)",
"description": "The amount of time to wait for a web browser to quit before killing the browser process.",
"default": 5000,
"hints": [
"advanced",
"v1LegacyId:f.quitTimeoutMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"useRequestCounter": {
"type": "boolean",
"title": "Use Request Counter",
"description": "Use the request counter plugin to wait for all pending ajax requests to be complete before loading the page contents.",
"default": true,
"hints": [
"v1LegacyId:f.useRequestCounter"
]
},
"requestCounterMinWaitMs": {
"type": "number",
"title": "Request counter min wait (ms)",
"description": "When the requestcounter is enabled, often early on the requestcount may say there are 0 pending requests... but there may still be ajax requests that haven't run yet. This parameter provides a certain time in milliseconds to wait for a non-zero count to be returned. If a requestcount is non-zero at any point, then the next requestcount = 0 is assumed to signify this page is done loading.",
"default": 750,
"hints": [
"v1LegacyId:f.requestCounterMinWaitMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"requestCounterMaxWaitMs": {
"type": "number",
"title": "Request counter max wait (ms)",
"description": "The request counter plugin counts active ajax requests after a page was loaded until there are no more pending ajax requests. This parameter says how long to wait in milliseconds for the requestcount to go to 0 before giving up.",
"default": 20000,
"hints": [
"v1LegacyId:f.requestCounterMaxWaitMs"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"useHighPerfJsEval": {
"type": "boolean",
"title": "High Performance Mode",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"default": false,
"hints": [
"v1LegacyId:f.useHighPerfJsEval",
"hidden"
]
},
"headlessBrowser": {
"type": "boolean",
"title": "Headless Browser",
"description": "Applicable only when \"Evaluate JavaScript\" is selected, deselect this checkbox if you want to actually see browser windows display while fetchers process web pages. Otherwise, if selected, browsers will run in \"headless\" mode which means they will run in the background. If running on a server with no desktop interface, this must stay selected.",
"default": true,
"hints": [
"v1LegacyId:f.headlessBrowser"
]
},
"takeScreenshot": {
"type": "boolean",
"title": "Index a screenshot of rendered page",
"description": "Applicable only when \"Evaluate JavaScript\" is selected, take a screenshot of the fully rendered page and index it. Screenshots will be indexed in a field called \"screenshot_bin\". You must make sure your schema specifies this field as a binary field or indexing will fail. To add this, go to System -> Solr Config -> Managed Schema then add <dynamicField indexed=\"true\" name=\"*_bin\" stored=\"true\" type=\"binary\"/>",
"default": false,
"hints": [
"v1LegacyId:f.takeScreenshot"
]
},
"screenshotFullscreen": {
"type": "boolean",
"title": "Screenshots Full Screen",
"description": "When taking a screenshot, capture the full screen.",
"default": false,
"hints": [
"v1LegacyId:f.screenshotFullscreen"
]
},
"viewportWidth": {
"type": "number",
"title": "Viewport Width",
"description": "Set an optional browser viewport width. If not specified, will default to 800.",
"hints": [
"v1LegacyId:f.viewportWidth"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"viewportHeight": {
"type": "number",
"title": "Viewport Height",
"description": "Set an optional browser viewport height. If not specified, will default to 600.",
"hints": [
"v1LegacyId:f.viewportHeight"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"deviceScreenFactor": {
"type": "number",
"title": "Device Screen Factor",
"description": "Set an optional browser device screen factor. If not specified, will default to 1 (no scaling).",
"hints": [
"v1LegacyId:f.deviceScreenFactor"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"simulateMobile": {
"type": "boolean",
"title": "Simulate mobile",
"description": "Simulate a mobile device",
"default": false,
"hints": [
"v1LegacyId:f.simulateMobile",
"advanced"
]
},
"mobileScreenWidth": {
"type": "number",
"title": "Mobile screen width (Only used for simulate mobile)",
"description": "If simulate mobile is checked, this species the device's emulated screen width.",
"hints": [
"advanced",
"v1LegacyId:f.mobileScreenWidth"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"mobileScreenHeight": {
"type": "number",
"title": "Mobile screen height (Only used for simulate mobile)",
"description": "If simulate mobile is checked, this species the device's emulated screen height.",
"hints": [
"advanced",
"v1LegacyId:f.mobileScreenHeight"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"chromeBinaryPath": {
"type": "string",
"title": "chromeBinaryPath",
"description": "This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.",
"hints": [
"v1LegacyId:f.chromeBinaryPath",
"advanced"
]
},
"chromeExtraCommandLineArgs": {
"type": "string",
"title": "Google Chrome Extra Command Line Options",
"description": "Specify additional command line arguments to add to the chromium executable when it is run.",
"hints": [
"advanced",
"v1LegacyId:f.chromeExtraCommandLineArgs"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.JavascriptEvaluationConfig"
]
},
"linkDiscoveryConfig": {
"type": "object",
"title": "Link Discovery",
"required": [],
"properties": {
"restrictToTreeIgnoredHostPrefixes": {
"type": "array",
"title": "Restrict crawl host prefix exemptions",
"description": "Modifies the behavior of 'Restrict crawl to start-link tree' to ignore the configured list of prefixes when restricting the crawl. Commonly, 'www.' is ignored so links with the same domain are allowed, whether of the form 'http://host.com' or 'http://www.host.com'. This option requires 'Restrict to start-link tree' to be enabled to have any effect.",
"hints": [
"v1LegacyId:restrictToTreeIgnoredHostPrefixes"
],
"items": {
"type": "string"
}
},
"restrictToTree": {
"type": "boolean",
"title": "Restrict crawl to start-link tree",
"description": "If true, only URLs that match the startLinks URL domain will be followed",
"default": true,
"hints": [
"v1LegacyId:restrictToTree"
]
},
"restrictToTreeAllowSubdomains": {
"type": "boolean",
"title": "Ignore sub-domains when restricting crawl",
"description": "Modifies the behavior of 'Restrict crawl to start-link tree' so that a link to any sub-domain of the start links is allowed. For example, if the start link is 'http://host.com', this option ensures that links to 'http://news.host.com' are also followed. This option requires 'Restrict to start-link tree' to be enabled to have any effect.",
"default": false,
"hints": [
"v1LegacyId:restrictToTreeAllowSubdomains"
]
},
"restrictToTreeUseHostAndPath": {
"type": "boolean",
"title": "Restrict crawl to start-link path",
"description": "Modifies the behavior of 'Restrict crawl to start-link tree' to include the 'path' of the start link in the restriction logic. For example, if the start link is 'http://host.com/US', this option will limit all followed URLs to ones starting with the '/US/' path. This option requires 'Restrict to start-link tree' to be enabled to have any effect.",
"default": false,
"hints": [
"v1LegacyId:restrictToTreeUseHostAndPath"
]
},
"sitemapURLs": {
"type": "array",
"title": "Sitemap URLs",
"description": "URLs for sitemaps, to be used a basis for link discovery. Rules found in sitemaps will not be processed.",
"hints": [
"v1LegacyId:f.sitemapURLs"
],
"items": {
"type": "string"
}
},
"respectMetaEquivRedirects": {
"type": "boolean",
"title": "Respect refresh redirects",
"description": "If true, the connector will follow metatags with refresh redirects such as <meta http-equiv=\"refresh\" />.",
"default": false,
"hints": [
"v1LegacyId:f.respectMetaEquivRedirects"
]
},
"allowCircularRedirects": {
"type": "boolean",
"title": "Allow circular redirects",
"description": "If true, a request can be redirected to the same URL multiple times",
"default": false,
"hints": [
"v1LegacyId:f.allowCircularRedirects"
]
},
"addedHeaders": {
"type": "string",
"title": "Headers to add to HTTP requests",
"description": "Add these headers to http requests. This is useful for web sites that require certain headers to let you visit them. Write each header on its own line in the format HeaderName: HeaderValue",
"hints": [
"lengthy",
"v1LegacyId:f.addedHeaders"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.LinkDiscoveryConfig"
]
},
"recrawlRulesConfig": {
"type": "object",
"title": "Recrawl Rules",
"required": [],
"properties": {
"delete": {
"type": "boolean",
"title": "Delete dead URIs",
"description": "Set to true to remove documents from the index when they can no longer be accessed as unique documents.",
"default": true,
"hints": [
"v1LegacyId:delete",
"advanced"
]
},
"deleteErrorsAfter": {
"type": "number",
"title": "Fetch failure allowance",
"description": "Number of times a website can error out, for example with a 500 error or a connection timeout, before a document is removed from the index. The default of -1 means such documents are never removed. Note that pages that return a 404 status code can be configured to be removed immediately regardless of this setting.",
"default": -1,
"hints": [
"v1LegacyId:deleteErrorsAfter",
"advanced"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"refreshAll": {
"type": "boolean",
"title": "Recrawl all items",
"description": "Set to true to always recrawl all items found in the crawldb.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:refreshAll"
]
},
"refreshStartLinks": {
"type": "boolean",
"title": "Recrawl start links",
"description": "Set to true to recrawl items specified in the list of start links.",
"default": false,
"hints": [
"v1LegacyId:refreshStartLinks",
"advanced"
]
},
"refreshErrors": {
"type": "boolean",
"title": "Recrawl errors",
"description": "Set to true to recrawl items that failed during the last crawl.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:refreshErrors"
]
},
"refreshOlderThan": {
"type": "number",
"title": "Recrawl age",
"description": "Number of seconds to recrawl items whose last fetched date is longer ago than this value.",
"default": -1,
"hints": [
"advanced",
"v1LegacyId:refreshOlderThan"
],
"maximum": 2147483647,
"exclusiveMaximum": false,
"minimum": -2147483648,
"exclusiveMinimum": false,
"multipleOf": 1
},
"refreshIDPrefixes": {
"type": "array",
"title": "Recrawl ID prefixes",
"description": "A prefix to recrawl all items whose IDs begin with this value.",
"hints": [
"advanced",
"v1LegacyId:refreshIDPrefixes"
],
"items": {
"type": "string"
}
},
"refreshIDRegexes": {
"type": "array",
"title": "Recrawl ID regexes",
"description": "A regular expression to recrawl all items whose IDs match this pattern.",
"hints": [
"v1LegacyId:refreshIDRegexes",
"advanced"
],
"items": {
"type": "string"
}
},
"refreshScript": {
"type": "string",
"title": "Recrawl script",
"description": "A JavaScript function ('shouldRefresh()') to customize the items recrawled. ",
"hints": [
"code",
"advanced",
"v1LegacyId:refreshScript",
"javascript"
]
},
"forceRefresh": {
"type": "boolean",
"title": "Force recrawl",
"description": "Set to true to recrawl all items even if they have not changed since the last crawl.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:forceRefresh"
]
},
"forceRefreshClearSignatures": {
"type": "boolean",
"title": "Clear signatures",
"description": "If true, signatures will be cleared if force recrawl is enabled.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:forceRefreshClearSignatures"
]
},
"delete404": {
"type": "boolean",
"title": "Remove 404/410 pages",
"description": "Select this option to delete indexed pages that return a 404 or 410 error.",
"default": true,
"hints": [
"advanced",
"v1LegacyId:delete404"
]
},
"sitemapIncrementalCrawling": {
"type": "boolean",
"title": "Process Sitemap URLs",
"description": "When enabled, only URLs found in the sitemap will be processed and crawled.",
"default": false,
"hints": [
"advanced",
"v1LegacyId:sitemap_incremental_crawling"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.RecrawlRulesConfig"
]
},
"cookieSpec": {
"type": "string",
"title": "Cookie spec",
"default": "browser-compatibility",
"hints": [
"v1LegacyId:f.cookieSpec"
]
},
"rewriteLinkScript": {
"type": "string",
"title": "URI rewrite script",
"description": "A Javascript function 'rewriteLink(link) { }' to modify links to documents before they are fetched.",
"hints": [
"code",
"advanced",
"v1LegacyId:rewriteLinkScript",
"javascript"
]
}
},
"interfaces": [
"com.lucidworks.connector.plugins.web.config.WebConfig$Properties",
"com.lucidworks.fusion.connector.plugin.api.config.ConnectorPluginProperties"
]
},
"id": {
"type": "string",
"title": "Configuration ID",
"description": "A unique identifier for this Configuration.",
"minLength": 1,
"pattern": "^[a-zA-Z0-9_-]+$"
},
"pipelineId": {
"type": "string",
"title": "Pipeline ID",
"description": "Name of the IndexPipeline used for processing output.",
"default": "lucidworks-web",
"minLength": 1,
"pattern": "^[a-zA-Z0-9_-]+$"
},
"parserId": {
"type": "string",
"title": "Parser ID",
"description": "The Parser to use in the associated IndexPipeline.",
"default": "lucidworks-web",
"pattern": "^[a-zA-Z0-9_-]+$"
},
"description": {
"type": "string",
"title": "Description",
"description": "Optional description",
"hints": [
"lengthy"
],
"maxLength": 125
},
"type": {
"type": "string",
"title": "Type",
"description": "A type ID for this connector.",
"hints": [
"readonly",
"hidden"
]
},
"created": {
"type": "string",
"title": "Date Created",
"description": "The date at which this Configuration was created.",
"hints": [
"readonly",
"hidden"
]
},
"modified": {
"type": "string",
"title": "Date Modified",
"description": "The date at which this Configuration was last modified.",
"hints": [
"readonly",
"hidden"
]
},
"diagnosticLogging": {
"type": "boolean",
"title": "Diagnostic Logging",
"description": "Enable diagnostic logging; disabled by default",
"default": false
},
"collectionId": {
"type": "string",
"title": "Collection ID",
"description": "The associated content Collection.",
"hints": [
"readonly",
"hidden"
],
"minLength": 1,
"pattern": "^[a-zA-Z0-9_-]+$"
},
"coreProperties": {
"type": "object",
"title": "Core Properties",
"description": "Common behavior and performance settings.",
"required": [],
"properties": {
"fetchSettings": {
"type": "object",
"title": "Fetch Settings",
"description": "System level settings for controlling fetch behavior and performance.",
"required": [],
"properties": {
"indexingThreads": {
"type": "number",
"title": "Index Subscription Threads",
"description": "Maximum number of indexing threads; defaults to 4.This setting controls the number of threads in the indexing service used for processing content documents emitted by this datasource.Higher values can sometimes help with overall fetch performance.",
"default": 4,
"maximum": 10,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"pluginInstances": {
"type": "number",
"title": "Number of plugin instances for distributed fetching",
"description": "Maximum number of plugin instances for distributed fetching. Only specified number of plugin instanceswill do fetching. This is useful for distributing load between different instances.",
"default": 0,
"maximum": 500,
"exclusiveMaximum": false,
"minimum": 0,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchItemQueueSize": {
"type": "number",
"title": "Fetch Item Queue Size",
"description": "Size of the fetch item queue.Larger values result in increased memory usage, but potentially higher performance.Default is 10k.",
"default": 10000,
"hints": [
"hidden"
],
"maximum": 500000,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchRequestCheckInterval": {
"type": "number",
"title": "Fetch request check interval(ms)",
"description": "The amount of time to wait before check if a request is done",
"default": 15000,
"hints": [
"hidden"
],
"maximum": 500000,
"exclusiveMaximum": false,
"minimum": 1000,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchResponseScheduledTimeout": {
"type": "number",
"title": "Fetch response scheduled timeout(ms)",
"description": "The maximum amount of time for a response to be scheduled. The task will be canceled if this setting is exceeded.",
"default": 300000,
"maximum": 500000,
"exclusiveMaximum": false,
"minimum": 1000,
"exclusiveMinimum": false,
"multipleOf": 1
},
"fetchResponseCompletedTimeout": {
"type": "number",
"title": "Fetch response completion timeout(ms)",
"description": "The maximum amount of time for a response to be completed. If exceeded, the task will be retried if the job is still running",
"default": 300000,
"hints": [
"hidden"
],
"maximum": 600000,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
},
"indexingInactivityTimeout": {
"type": "number",
"title": "Indexing inactivity timeout(seconds)",
"description": "The maximum amount of time to wait for indexing results (in seconds). If exceeded, the job will fail with an indexing inactivity timeout.",
"default": 86400,
"maximum": 691200,
"exclusiveMaximum": false,
"minimum": 60,
"exclusiveMinimum": false,
"multipleOf": 1
},
"pluginInactivityTimeout": {
"type": "number",
"title": "Plugin inactivity timeout(seconds)",
"description": "The maximum amount of time to wait for plugin activity (in seconds). If exceeded, the job will fail with a plugin inactivity timeout.",
"default": 600,
"maximum": 691200,
"exclusiveMaximum": false,
"minimum": 60,
"exclusiveMinimum": false,
"multipleOf": 1
},
"indexMetadata": {
"type": "boolean",
"title": "Index metadata",
"description": "When enabled the metadata of skipped items will be indexed to the content collection.",
"default": false
},
"indexContentFields": {
"type": "boolean",
"title": "Index content fields",
"description": "When enabled, content fields will be indexed to the crawl-db collection.",
"default": false
},
"asyncParsing": {
"type": "boolean",
"title": "Async Parsing",
"description": "When enabled, content will be indexed asynchronously.",
"default": false
},
"numFetchThreads": {
"type": "number",
"title": "Fetch Threads",
"description": "Maximum number of fetch threads; defaults to 5.This setting controls the number of threads that call the Connectors fetch method.Higher values can, but not always, help with overall fetch performance.",
"default": 5,
"maximum": 500,
"exclusiveMaximum": false,
"minimum": 1,
"exclusiveMinimum": false,
"multipleOf": 1
}
},
"interfaces": [
"com.lucidworks.fusion.connector.plugin.api.config.CoreConfig$FetchSystemSettings"
]
}
},
"interfaces": [
"com.lucidworks.fusion.connector.plugin.api.config.CoreConfig"
],
"hints": [
"advanced"
]
}
},
"category": "Web",
"interfaces": [
"com.lucidworks.connector.plugins.web.config.WebConfig",
"com.lucidworks.fusion.connector.plugin.api.config.ConnectorConfig"
]
}
}
The connector type.
OK
The response is of type object.
Was this page helpful?