add_failed_docs
Add failed documents
|
Set to true to add documents even if they partially fail processing. Failed documents will be added with as much metadata as is available, but may not include all expected fields.
type: boolean
default value: 'false'
|
bounds
Crawl bounds
|
Limits the crawl to a specific directory sub-tree, hostname or domain.
type: string
default value: 'tree'
enum: {
tree
host
domain
none
}
|
collection
Collection
|
The collection to which documents will be indexed.
type: string
pattern: ^[a-zA-Z0-9_-]+$
|
commit_on_finish
Solr commit on finish
|
Set to true to send a commit request to Solr after the last batch has been fetched, so that the documents are committed to the index.
type: boolean
default value: 'true'
|
crawl_depth
Max crawl depth
|
Number of levels in a directory or site tree to descend for documents.
type: integer
default value: '-1'
exclusiveMinimum: false
minimum: -1
|
crawl_item_timeout
Fetch timeout
|
Maximum time, in milliseconds, allowed to fetch any individual document.
type: integer
default value: '600000'
exclusiveMinimum: true
minimum: 0
|
exclude_paths
Exclusive regexes
|
Regular expressions for URI patterns to exclude. This will limit this datasource to only URIs that do not match the regular expression.
type: array of string
|
include_extensions
Included file extensions
|
List the file extensions to be fetched. Note: Files with possible matching MIME types but non-matching file extensions will be skipped. Extensions should be listed without periods, using whitespace to separate items (e.g., 'pdf zip').
type: array of string
|
include_paths
Inclusive regexes
|
Regular expressions for URI patterns to include. This will limit this datasource to only URIs that match the regular expression.
type: array of string
|
index_directories
Index directories
|
Set to true to add directories to the index as documents. If set to false, directories will not be added to the index, but they will still be traversed for documents.
type: boolean
default value: 'false'
|
max_bytes
Maximum file size (bytes)
|
Maximum size (in bytes) of documents to fetch or -1 for unlimited file size.
type: integer
default value: '10485760'
exclusiveMinimum: false
minimum: -1
|
max_docs
Max items
|
Maximum number of documents to fetch. The default (-1) means no limit.
type: integer
default value: '-1'
exclusiveMinimum: false
minimum: -1
|
max_threads
Fetch threads
|
The maximum number of threads to use for fetching data. Note: Each thread will create a new connection to the repository, which may improve overall throughput, but also requires more system resources, including CPU and memory.
type: integer
default value: '1'
|
maximum_connections
Maximum fetch connections
|
Maximum number of concurrent connections to the filesystem. A large number of documents could cause a large number of simultaneous connections to the repository and lead to errors or degraded performance. In some cases, reducing this number may help resolve performance issues.
type: integer
default value: '1000'
|
password
Password
|
Password for the user.
type: string
|
url
Start link
required
|
A starting URI for this datasource. The URI must be fully-qualified, and include the protocol, host, port and path, as appropriate.
type: string
minLength: 1
pattern: .*:.*
|
username
Username
|
Username with permissions to access the repository, if necessary.
type: string
|
verify_access
Validate access
|
Set to true to require a successful connection to the filesystem before saving this datasource.
type: boolean
default value: 'true'
|