chunkSize - integer
The number of items to batch for each round of fetching. A higher value can make crawling faster, but memory usage is also increased. The default is 1.
Default: 1
commitAfterItems - integer
Commit the crawlDB to disk after this many items have been received. A smaller number here will result in a slower crawl because of commits to disk being more frequent; conversely, a larger number here will cause a resumed job after a crash to need to recrawl more records.
Default: 10000
crawlDBType - string
The type of crawl database to use, in-memory or on-disk.
Default: on-disk
Allowed values: on-disk, in-memory
db - Connector DB
Type and properties for a ConnectorDB implementation to use with this datasource.
aliases - boolean
Keep track of original URIs that resolved to the current URI. This negatively impacts performance and size of DB.
Default: false
inlinks - boolean
Keep track of incoming links. This negatively impacts performance and size of DB.
Default: false
inv_aliases - boolean
Keep track of target URIs that the current URI resolves to. This negatively impacts performance and size of DB.
Default: false
type - string
Fully qualified class name of ConnectorDb implementation.
>= 1 characters
Default: com.lucidworks.connectors.db.impl.MapDbConnectorDb
dedupe - boolean
If true, documents will be deduplicated. Deduplication can be done based on an analysis of the content, on the content of a specific field, or by a JavaScript function. If neither a field nor a script is defined, content analysis will be used.
Default: false
dedupeField - string
Field to be used for dedupe. Define either a field or a dedupe script, otherwise the full raw content of each document will be used.
dedupeSaveSignature - boolean
If true, the signature used for dedupe will be stored in a 'dedupeSignature_s' field. Note this may cause errors about 'immense terms' in that field.
Default: false
dedupeScript - string
Custom javascript to dedupe documents. The script must define a 'genSignature(content){}' function, but can use any combination of document fields. The function must return a string.
delete - boolean
Set to true to remove documents from the index when they can no longer be accessed as unique documents.
Default: true
deleteErrorsAfter - integer
Number of fetch failures to tolerate before removing a document from the index. The default of -1 means documents are never removed because of fetch failures.
Default: -1
depth - integer
Number of levels in a directory or site tree to descend for documents.
Default: -1
emitThreads - integer
The number of threads used to send documents from the connector to the index pipeline. The default is 5.
Default: 5
f.acl_collection - string
The name of the collection to which the ACLs will be indexed.
Default: acl
f.acl_commit_after - integer
The ACL collection's auto commit value (-1 for never auto commit)
>= -1
exclusiveMinimum: false
Default: 30000
f.active_directory - boolean
Whether this LDAP server is an Active Directory server.
Default: true
f.adNetbiosDomain - string
Use this Active Directory netbios domain. If not specified, the 'CN=Partitions,CN=Configuration,{dn}' will be used to dynamically look up the netbios domain. Only applicable when f.active_directory = true.
f.additional_attributes - array[string]
Additional attributes to index from LDAP
f.azure_client_id - string
If crawling Azure AD instances, supply the Azure Application's Client ID. This is required when listing Azure Groups from Microsoft Graph API.
f.azure_client_secret - string
If crawling Azure AD instances, supply the Azure Application's Client secret. This is required when listing Azure Groups from Microsoft Graph API.
f.azure_tenant_id - string
If crawling Azure AD instances, supply Azure Tenant ID. This is required when listing Azure Groups from Microsoft Graph API.
f.do_not_follow_referrals - boolean
Do not follow referrals
Default: false
f.full_crawl_page_size - integer
How many records should be fetched per page during full crawls.
>= 1
<= 500
exclusiveMinimum: false
exclusiveMaximum: false
Default: 500
f.group_members_attributes - array[string]
If you have an attribute in LDAP that stores a 'members' attribute that lists a Group's members, you can specify it here and it will be used during ACL graph traversal. (Note: Active directory 'members' attribute is automatically fetched, and you do not need to list it here).
f.incremental_crawl_page_size - integer
How many records should be fetched per page during incremental crawls.
>= 1
<= 500
exclusiveMinimum: false
exclusiveMaximum: false
Default: 500
f.index_samaccountname - boolean
Creates a user document for the netbios\sAMAccountName of a user. Required for trimming datasources like SharePoint on premise that do security filtering on users of the 'domain\username' format.
Default: true
f.index_upn - boolean
Creates a user document for the userPrincipalName of a user. Required for trimming datasources like SharePoint Online that do security filtering based on the users of the 'username@fqdn.com' format.
Default: true
f.ldap_group_base - string
If you do not want to use the base DN for group searches, you can specify a specific base DN for group searches here.
f.ldap_group_filter - string
The ldap group filter to use when fetching all groups on ldap.
Default: (&(objectclass=group))
f.ldap_search_base - string
Base DN to connect to for Active Directory.
f.ldap_user_base - string
If you do not want to use the base DN for user searches, you can specify a specific base DN for user searches here.
f.ldap_user_filter - string
The ldap user filter to use when fetching all users on ldap.
Default: (&(objectclass=user)(sAMAccountName=*))
f.max_job_run_time_ms - integer
Maximum time the ldap job can run in milliseconds. -1 for unlimited.
Default: -1
f.memberof_attributes - array[string]
If you have an attribute in LDAP that acts as a 'memberOf' attribute, listing an LDAP User or Group's group memberships, list it here and it will be used during ACL graph traversal. (Note: Active Directory 'memberOf' attribute is automatically fetched, and you do not need to list it here).
f.password - string
The Active Directory account password.
f.retry_attempts - integer
If Retry requests is enabled, this number determines how many times the connector should retry a request before giving up. Setting this to 1 means failing requests will not be retried.
>= 1
exclusiveMinimum: false
Default: 1
f.userPrincipal - string
The Active Directory account user principal.
f.user_id_attribute - string
Applicable only to OpenLDAP, this specifies the attribute that will be used to get the User ID. CN will be used by default if not specified.
failFastOnStartLinkFailure - boolean
If true, when Fusion cannot connect to any of the provided start links, the crawl is stopped and an exception logged.
Default: true
fetchDelayMS - integer
Number of milliseconds to wait between fetch requests. The default is 0. This property can be used to throttle a crawl if necessary.
Default: 0
fetchThreads - integer
The number of threads to use during fetching. The default is 5.
Default: 5
forceRefresh - boolean
Set to true to recrawl all items even if they have not changed since the last crawl.
Default: false
forceRefreshClearSignatures - boolean
If true, signatures will be cleared if force recrawl is enabled.
Default: true
indexCrawlDBToSolr - boolean
EXPERIMENTAL: Set to true to index the crawl-database into a 'crawldb_<datasource-ID>' collection in Solr.
Default: false
initial_mapping - Initial field mapping
Provides mapping of fields before documents are sent to an index pipeline.
condition - string
Define a conditional script that must result in true or false. This can be used to determine if the stage should process or not.
label - string
A unique label for this stage.
<= 255 characters
mappings - array[object]
List of mapping rules
Default: [{"operation":"move","source":"charSet","target":"charSet_s"}, {"operation":"move","source":"fetchedDate","target":"fetchedDate_dt"}, {"operation":"move","source":"lastModified","target":"lastModified_dt"}, {"operation":"move","source":"signature","target":"dedupeSignature_s"}, {"operation":"move","source":"length","target":"length_l"}, {"operation":"move","source":"mimeType","target":"mimeType_s"}, {"operation":"move","source":"parent","target":"parent_s"}, {"operation":"move","source":"owner","target":"owner_s"}, {"operation":"move","source":"group","target":"group_s"}]
object attributes: {
operation: {
display name: Operation
type: string
}
source (required): {
display name: Source Field
type: string
}
target: {
display name: Target Field
type: string
}
}
reservedFieldsMappingAllowed - boolean
Default: false
skip - boolean
Set to true to skip this stage.
Default: false
unmapped - Unmapped Fields
If fields do not match any of the field mapping rules, these rules will apply.
operation - string
The type of mapping to perform: move, copy, delete, add, set, or keep.
Default: copy
Allowed values: copy, move, delete, set, add, keep
source - string
The name of the field to be mapped.
target - string
The name of the field to be mapped to.
maxItems - integer
Maximum number of documents to fetch. The default (-1) means no limit.
Default: -1
refreshAll - boolean
Set to true to always recrawl all items found in the crawldb.
Default: true
refreshErrors - boolean
Set to true to recrawl items that failed during the last crawl.
Default: false
refreshIDPrefixes - array[string]
Recrawl all items whose IDs begin with any of these prefixes.
refreshIDRegexes - array[string]
Recrawl all items whose IDs match any of these regular expressions.
refreshOlderThan - integer
Recrawl items whose last fetched date is more than this number of seconds ago.
Default: -1
refreshScript - string
A JavaScript function ('shouldRefresh()') to customize the items recrawled.
refreshStartLinks - boolean
Set to true to recrawl items specified in the list of start links.
Default: false
retryEmit - boolean
Set to true for emit batch failures to be retried on a document-by-document basis.
Default: true
startLinks - array[string]
One or more starting URIs for this datasource.