add_failed_docs
Add failed documents
|
Set to true to add documents even if they partially fail processing. Failed documents will be added with as much metadata as is available, but may be missing many expected fields.
type: boolean
default value: 'false'
|
bounds
Crawl bounds
|
Limits the crawl to a specific directory sub-tree, hostname, or domain.
type: string
default value: 'tree'
enum: {
tree
host
domain
none
}
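For illustration only (this is not the connector's implementation), the following Python sketch shows one way each bounds value could be interpreted when deciding whether a discovered URI is in scope; for S3 URLs the bucket name is assumed to play the role of the host:

    from urllib.parse import urlparse

    def in_bounds(start_url, candidate_url, bounds="tree"):
        # Hypothetical scope check for the 'bounds' setting.
        start, cand = urlparse(start_url), urlparse(candidate_url)
        if bounds == "none":
            return True
        if not cand.hostname:
            return False
        if bounds == "domain":
            domain = start.hostname.split(".", 1)[-1]   # crude parent-domain guess
            return cand.hostname == domain or cand.hostname.endswith("." + domain)
        if bounds == "host":
            return cand.hostname == start.hostname
        # "tree": same host and inside the starting directory sub-tree
        return cand.hostname == start.hostname and cand.path.startswith(start.path)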
|
collection
Collection
|
Collection documents will be indexed to.
type: string
|
commit_on_finish
Solr commit on finish
|
Set to true to send a commit request to Solr after the last batch has been fetched, committing the documents to the index.
type: boolean
default value: 'true'
|
crawl_depth
Max crawl depth
|
Number of levels in a directory or site tree to descend when fetching documents.
type: integer
default value: '-1'
exclusiveMinimum: false
minimum: -1
|
crawl_item_timeout
Fetch timeout
|
Maximum time, in milliseconds, allowed to fetch any individual document.
type: integer
default value: '600000'
exclusiveMinimum: true
minimum: 0
|
exclude_paths
Exclusive regexes
|
Regular expressions for URI patterns to exclude. This limits the datasource to only URIs that do not match these expressions.
type: array of string
default value: [ ]
|
include_extensions
Included file extensions
|
File extensions to be fetched. This limits the datasource to only these file extensions; files whose MIME types might otherwise match but whose extensions do not match will be skipped. Enter extensions without dots, separated by whitespace (e.g., 'pdf zip').
type: array of string
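As a rough illustration (not the connector's actual matching code), an extension list such as 'pdf zip' might be applied like this, with only the file extension consulted:

    def extension_allowed(filename, include_extensions=("pdf", "zip")):
        # Hypothetical filter: the extension alone decides; the MIME type is not consulted.
        allowed = {ext.lower().lstrip(".") for ext in include_extensions}
        if not allowed:
            return True                    # no filter configured
        ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
        return ext in allowed

    extension_allowed("report.PDF")        # True
    extension_allowed("notes.txt")         # False: skipped even if its MIME type would match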
|
include_paths
Inclusive regexes
|
Regular expressions for URI patterns to include. This limits the datasource to only URIs that match these expressions.
type: array of string
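A sketch of how the inclusive and exclusive regexes might combine (assumed behavior, not the connector's exact rules): a URI passes when it matches an include pattern, if any are configured, and matches no exclude pattern:

    import re

    def uri_allowed(uri, include_paths=(), exclude_paths=()):
        # Assumed combination of include_paths and exclude_paths.
        if include_paths and not any(re.search(p, uri) for p in include_paths):
            return False
        return not any(re.search(p, uri) for p in exclude_paths)

    uri_allowed("s3://my-bucket/reports/2021/q1.pdf",
                include_paths=[r"/reports/"],
                exclude_paths=[r"\.tmp$"])             # True; bucket and paths are hypothetical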
|
index_directories
Index directories
|
Set to true to add directories to the index as documents. If set to false, directories will not be added to the index, but they will still be traversed for documents.
type: boolean
default value: 'false'
|
max_bytes
Maximum file size (bytes)
|
Maximum size, in bytes, of documents to fetch.
type: integer
default value: '10485760'
exclusiveMinimum: false
minimum: -1
|
max_docs
Max items
|
Maximum number of documents to fetch. The default (-1) means no limit.
type: integer
default value: '-1'
exclusiveMinimum: false
minimum: -1
|
max_threads
Fetch threads
|
The maximum number of threads to use for fetching data. Each thread creates a new connection to the repository, which may improve overall throughput but also requires more system resources, including CPU and memory.
type: integer
default value: '1'
|
maximum_connections
Maximum fetch connections
|
Maximum number of concurrent connections to the filesystem. Crawling a large number of documents can open a large number of simultaneous connections to the repository and lead to errors or degraded performance; in some cases, reducing this number may help.
type: integer
default value: '1000'
|
password
AWS Secret Key
required
|
The AWS Secret Key associated with the Access Key.
type: string
|
splitter
Split CSV/TSV and archives
|
Set to true to split CSV/TSV files into one document per row, or to split archives such as .zip and .tar.gz into individual files.
type: object
object attributes: {
}
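To illustrate what per-row splitting means for a CSV/TSV file (a minimal sketch, not the splitter's implementation), each data row becomes its own document and the header row supplies the field names:

    import csv, io

    def split_csv(text, delimiter=","):
        # One document (dict) per data row; use delimiter="\t" for TSV.
        rows = csv.DictReader(io.StringIO(text), delimiter=delimiter)
        return [dict(row) for row in rows]

    split_csv("id,title\n1,First\n2,Second")
    # -> [{'id': '1', 'title': 'First'}, {'id': '2', 'title': 'Second'}]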
|
url
S3 Hadoop URL
required
|
A fully-qualified S3N URL, in the format s3://{bucketName}/{path}.
type: string
minLength: 1
pattern: .*:.*
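The pattern .*:.* only checks that a scheme-like prefix is present; a stricter, purely illustrative check of the documented format might look like this (the accepted scheme variants and the bucket/path names are assumptions):

    import re

    S3_URL = re.compile(r"^s3[an]?://(?P<bucket>[^/]+)(?P<path>/.*)?$")

    def check_s3_url(url):
        m = S3_URL.match(url)
        if not m:
            raise ValueError(f"not a fully-qualified S3 URL: {url!r}")
        return m.group("bucket"), m.group("path") or "/"

    check_s3_url("s3://my-bucket/data/docs")   # -> ('my-bucket', '/data/docs')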
|
username
AWS Key
required
|
An AWS Access Key ID that can access the content.
type: string
|
verify_access
Validate access
|
Set to true to require a successful connection to the filesystem before saving this datasource.
type: boolean
default value: 'true'
|
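Putting several of these properties together, a datasource configuration could be assembled roughly as follows; the wrapper structure (the 'id' field, the 'properties' nesting, and how the JSON is submitted) is an assumption and will vary by deployment, while the keys inside 'properties' are the ones documented above:

    import json

    datasource = {
        "id": "s3-docs",                        # hypothetical datasource name
        "properties": {
            "url": "s3://my-bucket/docs",       # hypothetical bucket and path
            "username": "AKIA...",              # AWS Access Key ID
            "password": "<secret>",             # AWS Secret Key
            "collection": "my_collection",
            "bounds": "tree",
            "include_extensions": ["pdf", "zip"],
            "exclude_paths": ["\\.tmp$"],
            "max_bytes": 10485760,
            "crawl_depth": -1,
            "verify_access": True,
        },
    }

    print(json.dumps(datasource, indent=2))     # submit via your deployment's datasource API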