Local Chunker Index Stage: Index pipeline stage configuration specifications

Table of Contents

Accepted format
Vector definitions
JavaScript to transform JSON to a string
Configuration

This feature is only available in Fusion 5.9.x for versions 5.9.14 and later.

The Local Chunker indexing stage uses your local Ray deployment or your API to break down large text documents into smaller, semantically meaningful chunks, vectorizes those chunks for Neural Hybrid Search, and stores those vectors in Solr.

Use this index stage if:

You want to use chunking in your Fusion search strategy with an external chunking solution.
You are comfortable setting up your own Ray Serve environment or using Fusion’s Ray image.
You cannot use the LWAI Chunker Index Stage, which uses Lucidworks AI to break down large text documents.

You must set up the Local Chunker stage with Fusion’s Ray image, your own Ray Serve environment, or an API. See Develop and deploy a chunking machine learning model with Ray for a tutorial for your own Ray Serve environment. If you are using an API instead of a Ray model deployment to do chunking, the minimum requirement is that the response matches what the Local Chunker Stage input requires.

Accepted format

The Local Chunker stage accepts data in the following format:

{
    "chunkedData": "{\"spans\": [[0, 13]], \"vectors\": [{\"vector\": [171, 167, 127, 148, 141...]}], \"chunks\": [\"I love Fusion\"]}"
}

You can obtain this format by converting a response such as the following to a single JSON string and using that string as the value of the `chunkedData` key:

{
"spans": [
        [ 0,13]
    ],
"vectors": [
        {"vector": [171, 167, 127, 148, 141...]}
    ],
"chunks": [
  "I love Fusion"
  ]
}

In the Develop and deploy a chunking machine learning model with Ray tutorial, the key to use as the Input Context Variable is `response`.

To use the Local Chunker stage, add the following to your solrconfig.xml file:

<!-- FUSION NOTES: These query parsers are used with Solr-based vector search -->
<queryParser name="xvecSim" class="org.apache.solr.lwbackported.XVecSimQParserPlugin"/>
<queryParser name="_lw_chunk_wrap" class="org.apache.solr.lw.ParentAndAllKidsWrapperQParserPlugin"/>
<queryParser name="neuralHybrid" class="org.apache.solr.lw.NeuralHybridQParserPlugin"/>

Vector definitions

Add the following vector definitions to your managed-schema.xml file:

<!-- Vector search fields -->
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_64v" stored="true" type="knn_64_vector"/>
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_128v" stored="true" type="knn_128_vector"/>
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_256v" stored="true" type="knn_256_vector"/>
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_384v" stored="true" type="knn_384_vector"/>
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_512v" stored="true" type="knn_512_vector"/>
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_768v" stored="true" type="knn_768_vector"/>
<dynamicField docValues="false" indexed="true" multiValued="false" name="*_1024v" stored="true" type="knn_1024_vector"/>

<!-- Field Types to support vector search -->
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_64_vector" similarityFunction="cosine" vectorDimension="64"/>
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_128_vector" similarityFunction="cosine" vectorDimension="128"/>
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_256_vector" similarityFunction="cosine" vectorDimension="256"/>
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_384_vector" similarityFunction="cosine" vectorDimension="384"/>
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_512_vector" similarityFunction="cosine" vectorDimension="512"/>
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_768_vector" similarityFunction="cosine" vectorDimension="768"/>
<fieldType class="solr.DenseVectorField" hnswBeamWidth="200" hnswMaxConnections="45" knnAlgorithm="hnsw" name="knn_1024_vector" similarityFunction="cosine" vectorDimension="1024"/>

<!-- INCOMPATIBLE with versions before 5.9.13 -->
<!-- BYTE Vector search fields -->
<dynamicField name="*_64bv" type="knn_64_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_128bv" type="knn_128_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_256bv" type="knn_256_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_384bv" type="knn_384_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_512bv" type="knn_512_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_768bv" type="knn_768_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_1024bv" type="knn_1024_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<!-- BYTE Field Types to support vector search -->
<fieldType name="knn_64_byte_vector" class="solr.DenseVectorField" vectorDimension="64" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_128_byte_vector" class="solr.DenseVectorField" vectorDimension="128" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_256_byte_vector" class="solr.DenseVectorField" vectorDimension="256" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_384_byte_vector" class="solr.DenseVectorField" vectorDimension="384" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_512_byte_vector" class="solr.DenseVectorField" vectorDimension="512" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_768_byte_vector" class="solr.DenseVectorField" vectorDimension="768" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_1024_byte_vector" class="solr.DenseVectorField" vectorDimension="1024" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>

JavaScript to transform JSON to a string

If you are returning JSON from the API, use the following code sample to pass the JSON response to the Local Chunker stage.

var api_json_response = ctx.get("api_response")["response"]
var stringified_api_response = JSON.stringify(api_json_response)
ctx.put("chunkedData", stringified_api_response)

Configuration

When entering configuration values in the UI, use unescaped characters, such as \t for the tab character. When entering configuration values in the API, use escaped characters, such as \\t for the tab character.

Local Chunker Index Stage: Index pipeline stage configuration specifications

Accepted format

Vector definitions

JavaScript to transform JSON to a string

Configuration

skip - boolean

label - string

condition - string

inputContextVariable - string (required)

outputContextVariable - string (required)

outputChunkSpans - string

outputTextChunks - string

failOnError - boolean