add_failed_docs
Add failed documents
|
Add the document even if it partially failed processing.
type: boolean
default value: 'false '
|
bounds
Traversal Boundaries
|
Traversal limits relative to the starting point. Default is 'tree'.
type: string
default value: 'tree '
enum: {
tree
host
domain
none
}
|
collection
Collection
|
Collection Name
type: string
|
commit_on_finish
Commit on Finish?
|
Issue a commit command when the job is finished. Default is true.
type: boolean
default value: 'true '
|
converter
Converter
|
Converter for Writable values extracted from Hadoop sequence / map files.
type: string
|
crawl_depth
Traversal Depth
|
Depth of traversal from the starting point. Default is -1 (unlimited).
type: integer
default value: '-1 '
exclusiveMinimum: false
minimum: -1
|
crawl_item_timeout
Item fetch timeout
|
Maximum time, in milliseconds, to fetch any individual item.
type: integer
default value: '600000 '
exclusiveMinimum: true
minimum: 0
|
exclude_paths
Exclude patterns
|
Resource URIs that match one or more of the exclude patterns will be skipped.
type: array of string
default value: [
]
|
include_extensions
File extensions
|
Process only files with these extensions.
type: array of string
|
include_paths
Include patterns
|
Only resource URIs that match one or more of the include patterns will be processed.
type: array of string
|
index_directories
Directories as documents
|
Add directory entries as separate documents.
type: boolean
default value: 'false '
|
max_bytes
Maximum bytes
|
Maximum bytes to process for each document. Longer documents will be truncated. A value of -1 means unlimited, and may lead to out-of-memory errors.
type: integer
default value: '10485760 '
exclusiveMinimum: false
minimum: -1
|
max_docs
Maximum documents
|
Maximum number of documents to process in a job. -1 means unlimited.
type: integer
default value: '-1 '
exclusiveMinimum: false
minimum: -1
|
max_threads
Maximum threads
|
Maximum number of threads to use for fetching documents.
type: integer
default value: '1 '
|
maximum_connections
Maximum connections
|
Maximum number of concurrent connections to the file system.
type: integer
default value: '1000 '
|
splitter
Container splitter
|
Settings for the optional large container files splitter (archives, CSV/TSV).
type: object
object attributes: {
}
|
url
File system URI
required
|
Fully-qualified URI of the file system path
type: string
minLength: 1
pattern: .*:.*
|
verify_access
Verify Access?
|
Verify that the target system is accessible using this data source configuration.
type: boolean
default value: 'true '
|