Index Pipelines API

The Index Pipelines API provides methods for managing a set of named index pipelines. Every pipeline is made up of one or more stages. Stages can be defined during the creation of a pipeline, or stages can be defined separately and included in one or more pipelines. For details of the REST API for index stages, see Index Stages API.

Document processing proceeds stage by stage in a linear fashion. The order of the stages in a pipeline is the order in which they were defined. At installation, Fusion includes several pre-configured pipelines. See Index Pipelines for details on these default pipelines.

For more information about structuring documents for indexing, see Pushing Documents to a Pipeline.

Examples

List the 'default' pipeline:

REQUEST

curl -u user:pass http://localhost:8764/api/apollo/index-pipelines/default

RESPONSE

{
  "id" : "default",
  "stages" : [ {
    "type" : "solr-index",
    "id" : "solr-default",
    "skip" : false
  } ]
}

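Create an index pipeline named 'my-index-pipeline' with three stages. One stage ('tika') does not yet exist and is defined inline; the other two ('conn_mapping' and 'solr-default') are included by reference: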

REQUEST

curl -u user:pass -X POST -H 'Content-type: application/json' -d '{"id":"my-index-pipeline","stages":[{"id":"tika","type":"tika-parser","includeImages":true},{"id":"conn_mapping","type":"ref"},{"id":"solr-default","type":"ref"}]}' http://localhost:8764/api/apollo/index-pipelines

RESPONSE

{
  "id" : "my-index-pipeline",
  "stages" : [ {
    "type" : "tika-parser",
    "id" : "tika",
    "includeImages" : true,
    "flattenCompound" : false,
    "addFailedDocs" : false,
    "addOriginalContent" : true,
    "contentField" : "_raw_content_",
    "skip" : false,
    "label" : "tika-parser"
  }, {
    "type" : "ref",
    "id" : "conn_mapping",
    "skip" : false,
    "label" : "ref"
  }, {
    "type" : "ref",
    "id" : "solr-default",
    "skip" : false,
    "label" : "ref"
  } ]
}

Reload the 'my-index-pipeline' pipeline:

INPUT

curl -u user:pass -X PUT http://localhost:8764/api/apollo/index-pipelines/my-index-pipeline/refresh

Index two JSON documents through a pipeline named 'conn_solr' into a collection named 'my-docs':

INPUT

curl -u user:pass -X POST -H "Content-Type: application/vnd.lucidworks-document" -d '[{"id": "myDoc1","fields": [{"name":"title", "value": "My first document"},{"name":"body", "value": "This is a simple document."}]}, {"id": "myDoc2","fields": [{"name":"title","value": "My second document"},{"name":"body", "value": "This is another simple document."}]}]' http://localhost:8764/api/apollo/index-pipelines/conn_solr/collections/my-docs/index

OUTPUT

[ {
  "id" : "myDoc1",
  "fields" : [ {
    "name" : "content",
    "value" : "This is a simple document.",
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "title",
    "value" : "My first document",
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "parsing_s",
    "value" : "no_raw_data",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "parsing_time_l",
    "value" : [ "java.lang.Long", 7 ],
    "metadata" : { },
    "annotations" : [ ]
  } ],
  "metadata" : { },
  "commands" : [ ]
}, {
  "id" : "myDoc2",
  "fields" : [ {
    "name" : "content",
    "value" : "This is another simple document.",
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "title",
    "value" : "My second document",
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "parsing_s",
    "value" : "no_raw_data",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "parsing_time_l",
    "value" : [ "java.lang.Long", 0 ],
    "metadata" : { },
    "annotations" : [ ]
  } ],
  "metadata" : { },
  "commands" : [ ]
} ]

Index a PDF document with the 'conn_solr' pipeline:

INPUT

curl -u user:pass -X POST -H "Content-Type: application/pdf" --data-binary @/solr/core/src/test-files/mailing_lists.pdf http://localhost:8764/api/apollo/index-pipelines/conn_solr/collections/my-docs/index

OUTPUT

[ {
  "id" : "d6c7757e-33d9-4fbb-aa38-eef84d679ca9",
  "fields" : [ {
    "name" : "fileSize_l",
    "value" : "8582",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "parsing_s",
    "value" : "no_raw_data",
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "pageCount_i",
    "value" : "2",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "parsing_time_l",
    "value" : [ "java.lang.Long", 1171 ],
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "parsing_time_l",
    "value" : [ "java.lang.Long", 0 ],
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "attr_pdf:encrypted_",
    "value" : "false",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "attr_X-Parsed-By_",
    "value" : "org.apache.tika.parser.pdf.PDFParser",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "attr_pdf:PDFVersion_",
    "value" : "1.3",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "attr_producer_",
    "value" : "FOP 0.20.5",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "content",
    "value" : "\nSolr Mailing Lists\n\nTable of contents\n1 ",
    "metadata" : { },
    "annotations" : [ ]
  }, {
    "name" : "attr_dc:format_",
    "value" : "application/pdf; version=1.3",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  }, {
    "name" : "mimeType_s",
    "value" : "application/pdf",
    "metadata" : {
      "creator" : "tika-parser"
    },
    "annotations" : [ ]
  } ],
  "metadata" : { },
  "commands" : [ ]
} ]

Index a JSON document through the 'conn_solr' pipeline into a collection called 'docs', using the "commands" option:

INPUT

curl -u user:pass -X POST -H "Content-Type: application/vnd.lucidworks-document" -d '[{"id": "myDoc2","commands": [{"name":"delete","value": "myDoc2"}]},{"id": "myDoc1","commands": [{"name":"delete","value": "myDoc1"},{"name":"commit","value": "true"}]}]' http://localhost:8764/api/apollo/index-pipelines/conn_solr/collections/docs/index

OUTPUT

[ {
  "id" : "myDoc2",
  "fields" : [ ],
  "commands" : [ {
    "name" : "delete",
    "params" : { }
  } ]
}, {
  "id" : "myDoc1",
  "fields" : [ ],
  "commands" : [ {
    "name" : "delete",
    "params" : { }
  }, {
    "name" : "commit",
    "params" : { }
  } ]
} ]

Index two simple documents through a pipeline named 'conn_solr' into a collection named 'my-docs', and get detailed output of the pipeline process:

INPUT

curl -u user:pass -X POST -H "Content-Type: application/vnd.lucidworks-document" -d '[{"id": "myDoc1","fields": [{"name":"title", "value": "My first document"},{"name":"body", "value": "This is a simple document."}]}, {"id": "myDoc2","fields": [{"name":"title","value": "My second document"},{"name":"body", "value": "This is another simple document."}]}]' http://localhost:8764/api/apollo/index-pipelines/conn_solr/collections/my-docs/debug

OUTPUT

The output shows how each document passed through each stage. (In the example output below, the 'mappings' list of the 'field-mapping' stage has been truncated for space.)

 {
  "stages" : [ {
    "type" : "tika-parser",
    "id" : "conn_tika",
    "includeImages" : true,
    "flattenCompound" : false,
    "addFailedDocs" : true,
    "addOriginalContent" : true,
    "contentField" : "_raw_content_",
    "skip" : false
  }, {
    "type" : "field-mapping",
    "id" : "conn_mapping",
    "mappings" : [
...
],
    "unmapped" : {
      "source" : "/(.*)/",
      "target" : "attr_$1_",
      "operation" : "move"
    },
    "skip" : false
  }, {
    "type" : "multivalue-resolver",
    "id" : "conn_multivalue_resolver",
    "typeStrategy" : [ {
      "fieldName" : "string",
      "resolverStrategy" : "pick_last"
    } ],
    "skip" : false
  }, {
    "type" : "solr-index",
    "id" : "conn_solr",
    "enforceSchema" : true,
    "skip" : false
  } ],
  "output" : [ {
    "stageType" : "tika-parser",
    "stageId" : "conn_tika",
    "context" : {
      "simulate" : false,
      "stageIndex" : 0,
      "collection" : "docs",
      "async" : false
    },
    "docs" : [ {
      "id" : "6b5c10f1-d941-41a6-957f-f677f5ad0fd5",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc1",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My first document"
        }, {
          "name" : "body",
          "value" : "This is a simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    }, {
      "id" : "4dac3c4e-d7f5-4cbd-96dc-e2eae69711e3",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc2",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My second document"
        }, {
          "name" : "body",
          "value" : "This is another simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    } ]
  }, {
    "stageType" : "field-mapping",
    "stageId" : "conn_mapping",
    "context" : {
      "simulate" : false,
      "stageIndex" : 1,
      "collection" : "docs",
      "async" : false
    },
    "docs" : [ {
      "id" : "6b5c10f1-d941-41a6-957f-f677f5ad0fd5",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc1",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My first document"
        }, {
          "name" : "body",
          "value" : "This is a simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    }, {
      "id" : "4dac3c4e-d7f5-4cbd-96dc-e2eae69711e3",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc2",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My second document"
        }, {
          "name" : "body",
          "value" : "This is another simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    } ]
  }, {
    "stageType" : "multivalue-resolver",
    "stageId" : "conn_multivalue_resolver",
    "context" : {
      "simulate" : false,
      "stageIndex" : 2,
      "collection" : "docs",
      "async" : false
    },
    "docs" : [ {
      "id" : "6b5c10f1-d941-41a6-957f-f677f5ad0fd5",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc1",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My first document"
        }, {
          "name" : "body",
          "value" : "This is a simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    }, {
      "id" : "4dac3c4e-d7f5-4cbd-96dc-e2eae69711e3",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc2",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My second document"
        }, {
          "name" : "body",
          "value" : "This is another simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    } ]
  }, {
    "stageType" : "solr-index",
    "stageId" : "conn_solr",
    "context" : {
      "simulate" : false,
      "stageIndex" : 3,
      "collection" : "docs",
      "async" : false
    },
    "docs" : [ {
      "id" : "6b5c10f1-d941-41a6-957f-f677f5ad0fd5",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc1",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My first document"
        }, {
          "name" : "body",
          "value" : "This is a simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    }, {
      "id" : "4dac3c4e-d7f5-4cbd-96dc-e2eae69711e3",
      "fields" : [ {
        "name" : "attr_id_",
        "value" : "myDoc2",
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "parsing_s",
        "value" : "no_raw_data",
        "metadata" : {
          "creator" : "tika-parser"
        },
        "annotations" : [ ]
      }, {
        "name" : "parsing_time_l",
        "value" : [ "java.lang.Long", 0 ],
        "metadata" : { },
        "annotations" : [ ]
      }, {
        "name" : "attr_fields_",
        "value" : [ "java.util.ArrayList", [ {
          "name" : "title",
          "value" : "My second document"
        }, {
          "name" : "body",
          "value" : "This is another simple document."
        } ] ],
        "metadata" : { },
        "annotations" : [ ]
      } ],
      "metadata" : { },
      "commands" : [ ]
    } ]
  } ]
}