f.addedHeaders
Headers to add to HTTP requests
|
Add these headers to http requests. This is useful for web sites that require certain headers to let you visit them. Write each header on its own line in the format HeaderName: HeaderValue
type: string
|
f.allowAllCertificates
Allow all HTTPS certificates
|
If false, security checks will be performed on all SSL/TLS certificate signers and origins. This means self-signed certificates would not be supported.
type: boolean
default value: 'false '
|
f.allowCircularRedirects
Allow circular redirects
|
If true, a request can be redirected to the same URL multiple times
type: boolean
default value: 'false '
|
f.appendTrailingSlashToLinks
Add trailing slash to link URLs
|
If true, a trailing '/' will be added to link URLs when the URL does not end in a dot ('.').
type: boolean
default value: 'false '
|
f.basicAuth
Basic Authentication
|
Settings for Basic authentication
type: array of object
object attributes: {
host
(required)
: {
display name: Host
type: string
description : The host of the site. You can specify * to authenticate against any host. (Warning: authentication cookies will be sent to any host.)
}
id
: {
display name: Auth Config id
type: string
description : Auth Config id
}
password
: {
display name: Password
type: string
description : The password for the user
}
port
(required)
: {
display name: Port
type: integer
description : The port. You can specify -1 to authenticate against any port
}
realm
: {
display name: Realm
type: string
description : The realm for the site, if any
}
userName
: {
display name: User
type: string
description : The username to use for authentication
}
}
|
f.bulkStartLinks
Bulk Start Links
|
If a large number of start links must be defined, you can provide them here. One link per line.
type: string
|
f.canonicalTagsRedirectLimit
The maximum number of requests to perform while resolving the canonical URL for a page.
|
Because canonical tag resolution may be cyclical, a limit must be applied to the total number of requests. This value ensures that the resolution finishes in a reasonable amount of time.
type: integer
default value: '4 '
|
f.chromeBinaryPath
chromeBinaryPath
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.chromeExtraCommandLineArgs
Google Chrome Extra Command Line Options
|
Specify additional command line arguments to add to the chromium executable when it is run.
type: string
|
f.cookieSpec
Cookie spec
|
type: string
default value: 'browser-compatibility '
enum: {
browser-compatibility
rfc-2965
best-match
ignore-all
}
|
f.crawlJS
Evaluate Javascript
|
Evaluate JavaScript on web pages when crawling. This makes it possible for the Web fetcher to extract content from pages that is only available after JavaScript has prepared the document, but it may make the crawl slower because JavaScript loading can be time consuming.
type: boolean
default value: 'false '
|
f.credentialsFile
credentialsFile
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.customLinkSelectors
Custom XPath Link Selectors
|
By default, only standard anchor tags, iframe tags, frame tags, and link tags are fetched. This allows you to use one or more XPath expressions to parse links from custom places, such as //option/@value.
type: array of string
|
f.defaultCharSet
Default character set
|
Default character set to use when one is not declared in the HTTP headers.
type: string
default value: 'UTF-8 '
|
f.defaultMIMEType
Default MIME type
|
Default MIME type to use when one is not declared in the HTTP headers.
type: string
default value: 'application/octet-stream '
|
f.deviceScreenFactor
Device Screen Factor
|
Set an optional browser device screen factor. If not specified, will default to 1 (no scaling).
type: integer
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 99999
minimum: 1
|
f.digestAuth
Digest Authentication
|
Settings for Digest authentication
type: array of object
object attributes: {
host
(required)
: {
display name: Host
type: string
description : The host of the site. You can specify * to authenticate against any host. (Warning: authentication cookies will be sent to any host.)
}
id
: {
display name: Auth Config id
type: string
description : Auth Config id
}
password
: {
display name: Password
type: string
description : The password for the user
}
port
(required)
: {
display name: Port
type: integer
description : The port. You can specify -1 to authenticate against any port
}
realm
: {
display name: Realm
type: string
description : The realm for the site, if any
}
userName
: {
display name: User
type: string
description : The username to use for authentication
}
}
|
f.discardLinkURLAnchors
Discard anchors in link URLs
|
If true, anchors found in URLs will be removed before being added to the discovery queue.
type: boolean
default value: 'true '
|
f.discardLinkURLQueries
Discard queries in link URLs
|
If true, query parameters found in URLs will be removed before being added to the discovery queue.
type: boolean
default value: 'false '
|
f.excludeSelectors
Jsoup exclusive selectors
|
Jsoup-formatted selectors for elements to exclude from the crawled content. Syntax for jsoup selectors is available at http://jsoup.org/apidocs/org/jsoup/select/Selector.html.
type: array of string
|
f.excludeTagClasses
Excluded tag classes
|
HTML tag classes of elements to exclude from the crawled content.
type: array of string
|
f.excludeTagIDs
Excluded tag IDs
|
HTML tag IDs of elements to exclude from the crawled content.
type: array of string
|
f.excludeTags
Excluded tags
|
HTML tag names of elements to exclude from the crawled content.
type: array of string
|
f.extraLoadTimeMs
Extra time to wait for content after page load (ms)
|
The JavaScript evaluation process will first wait for the DOM 'document.readyState' to be set to 'complete'; then it will wait until there are no more pending Ajax requests before emitting the page’s contents. Use this property to wait an additional number of milliseconds before emitting the contents. This gives background JavaScript routines a chance to finish rendering the page before the contents are emitted.
type: integer
default value: '250 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 2147483647
minimum: -1
|
f.extraPageLoadDeltaChars
Stop waiting for extraLoadTimeMs if page size increases by this many bytes.
|
This parameter is used when the "Extra time to wait for content after page load (ms)" parameter is > 0. It will stop the additional wait time if it sees the web page's content grows by at least this many characters. If set to 0 (the default) any increase in character count indicates the page load is finished.
type: integer
default value: '0 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 2147483647
minimum: 0
|
f.filteringRootTags
Root elements to filter
|
Root HTML elements whose child elements will be used to extract content. By default 'body' and 'head' elements are already included.
type: array of string
default value: 'body', 'head'
|
f.firefoxBinaryPath
Firefox Custom Binary Path [DEPRECATED]
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.firefoxHeadlessBrowser
Run Firefox in Headless Mode [DEPRECATED]
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: boolean
default value: 'true '
|
f.followCanonicalTags
Deduplication via canonical tag
|
Deduplicate, by only indexing the document at the URL specified in the canonical tag. https://en.wikipedia.org/wiki/Canonical_link_element
type: boolean
default value: 'false '
|
f.formAuth
Form Authentication
|
Settings for Form based authentication
type: array of object
object attributes: {
action
(required)
: {
display name: URL
type: string
description : The URL of the authentication endpoint
}
id
: {
display name: Auth Config id
type: string
description : Auth Config id
}
params
: {
display name: Parameters
type: object
:
}
password
: {
display name: Password
type: string
description : The password to use for the authentication request. This will be copied into the "Parameters" using the "Password Parameter" name as the key
}
passwordParamName
: {
display name: Password Parameter
type: string
description : Name of the parameter containing the password
}
ttl
: {
display name: TTL (ms)
type: number
description : The "time to live" in milliseconds for the session that will be created after authentication.
}
}
|
f.headlessBrowser
Headless Browser
|
Applicable only when "Evaluate JavaScript" is selected. Deselect this checkbox if you want to actually see browser windows display while fetchers process web pages. Otherwise, if selected, browsers will run in "headless" mode, which means they will run in the background. If running on a server with no desktop interface, this must stay selected.
type: boolean
default value: 'true '
|
f.includeSelectors
Jsoup inclusive selectors
|
Jsoup-formatted selectors for elements to include in the crawled content.
type: array of string
|
f.includeTagClasses
Included tag classes
|
HTML tag classes of elements to include in the crawled content.
type: array of string
|
f.includeTagIDs
Included tag IDs
|
HTML tag IDs of elements to include in the crawled content.
type: array of string
|
f.includeTags
Included tags
|
HTML tag names of elements to include in the crawled content.
type: array of string
|
f.index_items_discarded
Index discarded document metadata
|
Enable to index discarded document metadata
type: boolean
default value: 'false '
|
f.jsAjaxTimeout
AJAX Timeout
|
The time in milliseconds after which an AJAX request will be ignored when considering whether all AJAX requests have completed. Maximum: 180,000ms i.e. 3 minutes
type: integer
default value: '20000 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 180000
minimum: -1
|
f.jsEnabledAuth
Evaluate JavaScript during SmartForms/SAML Login
|
Evaluate JavaScript when doing SAML/SmartForm authentication. This is only applicable if you have specified a SmartForms/SAML Authentication element in the "Crawl Authentication" area.
type: boolean
default value: 'false '
|
f.jsPageLoadTimeout
Timeout
|
The time to wait in milliseconds for a page load to complete. If the timeout is -1, page loads can be indefinite. Maximum: 180,000ms i.e. 3 minutes
type: integer
default value: '20000 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 180000
minimum: -1
|
f.jsScriptTimeout
Script Timeout
|
The time to wait in milliseconds for an asynchronous script to finish execution. If the timeout is -1, then the script will be allowed to run indefinitely. Maximum: 30,000ms
type: integer
default value: '20000 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 180000
minimum: -1
|
f.kerberosEnabled
kerberosEnabled
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: boolean
|
f.kerberosKeytabBase64
kerberosKeytabBase64
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.kerberosKeytabFile
kerberosKeytabFile
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.kerberosLoginContextName
kerberosLoginContextName
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.kerberosPassword
kerberosPassword
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.kerberosPrincipal
kerberosPrincipal
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.kerberosSpn
kerberosSpn
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: string
|
f.maintainCookies
Maintain Cookies for No-Auth Crawls
|
If you are not using authentication, then by default cookies are not stored in between web requests (stateless). If checked, cookies will be maintained between requests during the web crawl even when you are not using authentication. If you are using authentication, this checkbox has no effect on the crawl and can be ignored.
type: boolean
default value: 'false '
|
f.maxSizeBytes
Max file size (bytes)
|
Maximum size, in bytes, of a document to fetch.
type: integer
default value: '4194304 '
|
f.mobileScreenHeight
Mobile screen height (Only used for simulate mobile)
|
If simulate mobile is checked, this specifies the device's emulated screen height.
type: integer
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 9999999
minimum: 1
|
f.mobileScreenWidth
Mobile screen width (Only used for simulate mobile)
|
If simulate mobile is checked, this specifies the device's emulated screen width.
type: integer
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 9999999
minimum: 1
|
f.ntlmAuth
NTLM Authentication
|
Settings for NTLM authentication
type: array of object
object attributes: {
domain
: {
display name: Domain
type: string
description : The NTLM Domain
}
host
(required)
: {
display name: Host
type: string
description : The host of the site. You can specify * to authenticate against any host. (Warning: authentication cookies will be sent to any host.)
}
id
: {
display name: Auth Config id
type: string
description : Auth Config id
}
password
: {
display name: Password
type: string
description : The password for the user
}
port
(required)
: {
display name: Port
type: integer
description : The port. You can specify -1 to authenticate against any port
}
realm
: {
display name: Realm
type: string
description : The realm for the site, if any
}
userName
: {
display name: User
type: string
description : The username to use for authentication
}
workstation
: {
display name: Workstation
type: string
description : The NTLM Workstation name
}
}
|
f.obeyCharSet
Obey server-supplied charset
|
Use the encoding sent by the web server (if any) when parsing content. If unset, Fusion will try to guess the character set when parsing.
type: boolean
default value: 'true '
|
f.obeyLinkNofollow
Obey link nofollow attributes
|
If true, rel='nofollow' attributes on links are obeyed.
type: boolean
default value: 'true '
|
f.obeyRobots
Obey robots.txt
|
If true, Allow, Disallow and other rules found in a robots.txt file will be obeyed.
type: boolean
default value: 'true '
|
f.obeyRobotsDelay
Obey robots.txt Crawl-Delay
|
If true, Crawl-Delay rules in robots.txt will be obeyed. Disabling this option will speed up crawling, but is considered negative behavior for sites you do not control.
type: boolean
default value: 'true '
|
f.obeyRobotsMeta
Obey robots meta tags and headers
|
If true, rules like 'noindex', 'nofollow' and others found in a robots meta tag on a page or in the headers of the HTTP response are obeyed.
type: boolean
default value: 'true '
|
f.proxy
HTTP proxy address
|
Address of the HTTP proxy, if required. This should be entered in the format host:port.
type: string
|
f.quitTimeoutMs
Web Driver Quit Timeout (milliseconds)
|
The amount of time to wait for a web browser to quit before killing the browser process.
type: integer
default value: '5000 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 9999999
minimum: -1
|
f.requestCounterMaxWaitMs
Request counter max wait (ms)
|
The request counter plugin counts active ajax requests after a page was loaded until there are no more pending ajax requests. This parameter says how long to wait in milliseconds for the requestcount to go to 0 before giving up.
type: integer
default value: '20000 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 99999999
minimum: 1
|
f.requestCounterMinWaitMs
Request counter min wait (ms)
|
When the requestcounter is enabled, often early on the requestcount may say there are 0 pending requests... but there may still be ajax requests that haven't run yet. This parameter provides a certain time in milliseconds to wait for a non-zero count to be returned. If a requestcount is non-zero at any point, then the next requestcount = 0 is assumed to signify this page is done loading.
type: integer
default value: '5000 '
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 99999999
minimum: 0
|
f.requestRetryCount
Request Retry Count
|
If an http request fails, retry up to this many times before giving up. If set to 0, requests will not be retried. This is useful in situations where your crawls are failing with errors like "The target server failed to respond".
type: integer
default value: '0 '
|
f.respectMetaEquivRedirects
Respect refresh redirects
|
If true, the connector will follow metatags with refresh redirects such as <meta http-equiv="refresh" />.
type: boolean
default value: 'false '
|
f.samlAuth
SAML/Smart Form Authentication
|
Settings for SAML/Smart Form based authentication allows you to visit one or more web pages that contain form inputs such as username, password, security questions, etc., submitting each one in turn in order to become authenticated.
type: array of object
object attributes: {
action
(required)
: {
display name: URL
type: string
description : The URL of the authentication endpoint
}
id
: {
display name: Auth Config id
type: string
description : Auth Config id
}
params
: {
display name: Parameters
type: object
:
}
password
: {
display name: Password
type: string
description : The password to use for the authentication request. This will be copied into the "Parameters" using the "Password Parameter" name as the key
}
passwordParamName
: {
display name: Password Parameter
type: string
description : Name of the parameter containing the password
}
ttl
: {
display name: TTL (ms)
type: number
description : The "time to live" in milliseconds for the session that will be created after authentication.
}
}
|
f.scrapeLinksBeforeFiltering
Scrape links before filtering
|
If true, links will be extracted from documents before any other document processing has occurred. By default, links are extracted after all other document processing.
type: boolean
default value: 'false '
|
f.screenshotFullscreen
Screenshots Full Screen
|
When taking a screenshot, capture the full screen.
type: boolean
default value: 'false '
|
f.selectorFields
Jsoup selector fields
|
List of Jsoup selectors for elements to put into their separate field in the index. The field will have the same name as the element. Syntax for jsoup selectors is available at http://jsoup.org/apidocs/org/jsoup/select/Selector.html.
type: array of string
|
f.simulateMobile
Simulate mobile
|
Simulate a mobile device
type: boolean
default value: 'false '
|
f.sitemapURLs
Sitemap URLs
|
URLs for sitemaps, to be used as a basis for link discovery. Rules found in sitemaps will not be processed.
type: array of string
|
f.tagClassFields
Tag-class fields
|
HTML tag classes of elements to put into their own field in the index. The field will have the same name as the tag class.
type: array of string
|
f.tagFields
Tag fields
|
HTML tags of elements to put into their own field in the index. The field will have the same name as the tag.
type: array of string
|
f.tagIDFields
Tag-ID fields
|
HTML tag IDs of elements to put into their own field in the index. The field will have the same name as the tag ID.
type: array of string
|
f.takeScreenshot
Index a screenshot of rendered page
|
Applicable only when "Evaluate JavaScript" is selected. Take a screenshot of the fully rendered page and index it. Screenshots will be indexed in a field called "screenshot_bin". You must make sure your schema specifies this field as a binary field or indexing will fail. To add this, go to System -> Solr Config -> Managed Schema, then add <field name="screenshot_bin" type="binary" indexed="true" stored="true" />
type: boolean
default value: 'false '
|
f.timeoutMS
Connection timeout (ms)
|
Time in milliseconds to wait for server response.
type: integer
default value: '10000 '
|
f.useFirefox
Use Firefox [DEPRECATED]
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: boolean
default value: 'false '
|
f.useHighPerfJsEval
High Performance Mode
|
This property is no longer in use, and is only in place due to backwards compatible configuration validation purposes.
type: boolean
default value: 'false '
|
f.useIpAddressForSslConnections
Use IP address for SSL connections
|
Use IP address instead of host name for SSL connections. This is used to work around a misconfigured HTTP server throwing an 'unrecognized name' error when SNI is enabled. (This only works if the 'Allow all certificates' setting is also enabled.)
type: boolean
default value: 'false '
|
f.useRequestCounter
Use Request Counter
|
Use the request counter plugin to wait for all pending ajax requests to be complete before loading the page contents.
type: boolean
default value: 'true '
|
f.userAgentEmail
HTTP user-agent email address
|
Email address to use as part of connector identification.
type: string
|
f.userAgentName
HTTP user-agent name
|
Name the connector should use when identifying itself to a website in order to crawl it.
type: string
default value: 'Lucidworks-Anda/2.0 '
|
f.userAgentWebAddr
HTTP user-agent web address
|
Web address to use as part of connector identification.
type: string
|
f.viewportHeight
Viewport Height
|
Set an optional browser viewport height. If not specified, will default to 600.
type: integer
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 9999999
minimum: 1
|
f.viewportWidth
Viewport Width
|
Set an optional browser viewport width. If not specified, will default to 800.
type: integer
exclusiveMaximum: false
exclusiveMinimum: false
maximum: 9999999
minimum: 1
|
startLinks
Start Links
|
The URL(s) that the crawler will start crawling from, for example: https://en.wikipedia.org/wiki/Main_Page
type: array of string
|