aliasExpiration
Alias expiration
|
The number of crawls after which an alias will expire. The default is 1 crawl.
type: integer
default value: '1 '
|
chunkSize
Fetch batch size
|
The number of items to batch for each round of fetching. A higher value can make crawling faster, but memory usage is also increased. The default is 1.
type: integer
default value: '1 '
|
collection
Collection
|
The collection to which documents will be indexed.
type: string
pattern: ^[a-zA-Z0-9_-]+$
|
commitAfterItems
Commit After This Many Items
|
Commit the crawlDB to disk after this many items have been received. A smaller number here will result in a slower crawl because of commits to disk being more frequent; conversely, a larger number here will cause a resumed job after a crash to need to recrawl more records.
type: integer
default value: '10000 '
|
crawlDBType
Crawl database type
|
The type of crawl database to use, in-memory or on-disk.
type: string
default value: 'on-disk '
enum: {
in-memory
on-disk
}
|
dedupe
Dedupe documents
|
If true, documents will be deduplicated. Deduplication can be done based on an analysis of the content, on the content of a specific field, or by a JavaScript function. If neither a field nor a script are defined, content analysis will be used.
type: boolean
default value: 'false '
|
dedupeField
Dedupe field
|
Field to be used for dedupe. Define either a field or a dedupe script, otherwise the full raw content of each document will be used.
type: string
|
dedupeSaveSignature
Save dedupe signature
|
If true, the signature used for dedupe will be stored in a 'dedupeSignature_s' field. Note this may cause errors about 'immense terms' in that field.
type: boolean
default value: 'false '
|
dedupeScript
Dedupe script
|
Custom JavaScript to dedupe documents. The script must define a 'genSignature(content){}' function, but can use any combination of document fields. The function must return a string.
type: string
|
delete
Delete dead URIs
|
Set to true to remove documents from the index when they can no longer be accessed as unique documents.
type: boolean
default value: 'true '
|
deleteErrorsAfter
Fetch failure allowance
|
Number of fetch failures to tolerate before removing a document from the index. The default of -1 means documents will never be removed because of fetch failures.
type: integer
default value: '-1 '
|
depth
Max crawl depth
|
Number of levels in a directory or site tree to descend for documents.
type: integer
default value: '-1 '
|
diagnosticMode
Diagnostic mode
|
Enable to print more detailed information to the logs about each request.
type: boolean
default value: 'false '
|
emitThreads
Emit threads
|
The number of threads used to send documents from the connector to the index pipeline. The default is 5.
type: integer
default value: '5 '
|
excludeExtensions
Excluded file extensions
|
File extensions that should not be fetched. This will limit this datasource to all extensions except those in this list.
type: array of string
|
excludeRegexes
Exclusive regexes
|
Regular expressions for URI patterns to exclude. This will limit this datasource to only URIs that do not match the regular expression.
type: array of string
|
f.addFileMetadata
Add file metadata
|
Set to true to add information about documents found in the filesystem to the document, such as document owner, group, or ACL permissions.
type: boolean
default value: 'true '
|
f.fs.domain
Domain
|
Windows File Share Domain
type: string
|
f.fs.password
Password
required
|
Windows File Share Password
type: string
|
f.fs.socket_timeout
Socket timeout
|
Socket Timeout (default is 0 seconds, blocks forever)
type: integer
default value: '0 '
exclusiveMinimum: false
minimum: 0
|
f.fs.timeout
Timeout
|
Timeout sets Read, Write, and Transact timeout (default is 60 seconds)
type: integer
default value: '60 '
exclusiveMinimum: false
minimum: 0
|
f.fs.username
Username
required
|
Windows File Share Username
type: string
|
f.fs.verify_access
Validate access
|
Set to true to require successful connection to the filesystem before saving this datasource.
type: boolean
default value: 'true '
|
f.index_items_discarded
Index discarded document metadata
|
Enable to index discarded document metadata
type: boolean
default value: 'false '
|
f.maxSizeBytes
Maximum file size (bytes)
|
Maximum size (in bytes) of documents to fetch or -1 for unlimited file size.
type: integer
default value: '4194304 '
|
f.minSizeBytes
Minimum file size (bytes)
|
Minimum size, in bytes, of documents to fetch.
type: integer
default value: '0 '
|
failFastOnStartLinkFailure
Fail crawl if start links are invalid
|
If true, when Fusion cannot connect to any of the provided start links, the crawl is stopped and an exception logged.
type: boolean
default value: 'true '
|
fetchDelayMS
Fetch delay
|
Number of milliseconds to wait between fetch requests. The default is 0. This property can be used to throttle a crawl if necessary.
type: integer
default value: '0 '
|
fetchThreads
Fetch threads
|
The number of threads to use during fetching. The default is 5.
type: integer
default value: '5 '
|
forceRefresh
Force recrawl
|
Set to true to recrawl all items even if they have not changed since the last crawl.
type: boolean
default value: 'false '
|
forceRefreshClearSignatures
Clear signatures
|
If true, signatures will be cleared if force recrawl is enabled.
type: boolean
default value: 'true '
|
includeExtensions
Included file extensions
|
File extensions to be fetched. This will limit this datasource to only these file extensions.
type: array of string
|
includeRegexes
Inclusive regexes
|
Regular expressions for URI patterns to include. This will limit this datasource to only URIs that match the regular expression.
type: array of string
|
maxItems
Max items
|
Maximum number of documents to fetch. The default (-1) means no limit.
type: integer
default value: '-1 '
|
reevaluateCrawlDbOnStart
Reevaluate crawldb on start?
|
Reevaluate existing crawldb entries for legality on startup?
type: boolean
default value: 'false '
|
refreshAll
Recrawl all items
|
Set to true to always recrawl all items found in the crawldb.
type: boolean
default value: 'true '
|
refreshErrors
Recrawl errors
|
Set to true to recrawl items that failed during the last crawl.
type: boolean
default value: 'false '
|
refreshIDPrefixes
Recrawl ID prefixes
|
A prefix to recrawl all items whose IDs begin with this value.
type: array of string
|
refreshIDRegexes
Recrawl ID regexes
|
A regular expression to recrawl all items whose IDs match this pattern.
type: array of string
|
refreshOlderThan
Recrawl age
|
Recrawl items whose last fetched date is more than this number of seconds ago.
type: integer
default value: '-1 '
|
refreshScript
Recrawl script
|
A JavaScript function ('shouldRefresh()') to customize the items recrawled.
type: string
|
refreshStartLinks
Recrawl start links
|
Set to true to recrawl items specified in the list of start links.
type: boolean
default value: 'false '
|
retainOutlinks
Retain links in the crawldb
|
Set to true for links found during fetching to be stored in the crawldb. This increases precision in certain recrawl scenarios, but requires more memory and disk space.
type: boolean
default value: 'false '
|
retryEmit
Retry emits
|
Set to true for emit batch failures to be retried on a document-by-document basis.
type: boolean
default value: 'true '
|
rewriteLinkScript
URI rewrite script
|
A JavaScript function 'rewriteLink(link) { }' to modify links to documents before they are fetched.
type: string
|
startLinks
Start Links
required
|
One or more starting URIs for this datasource.
type: array of string
|