Local Chunker Index Stage
Index pipeline stage configuration specifications
This feature is only available in Fusion 5.9.x for versions 5.9.14 and later.
The Local Chunker indexing stage uses your local Ray deployment or your API to break down large text documents into smaller, semantically meaningful chunks, vectorizes those chunks for Neural Hybrid Search, and stores those vectors in Solr.
Use this index stage if:
- You want to use chunking in your Fusion search strategy with an external chunking solution.
- You are comfortable setting up your own Ray Serve environment or using Fusion’s Ray image.
- You cannot use the LWAI Chunker Index Stage, which uses Lucidworks AI to break down large text documents.
You must set up the Local Chunker stage with Fusion’s Ray image, your own Ray Serve environment, or an API. See Develop and deploy a chunking machine learning model with Ray for a tutorial on setting up your own Ray Serve environment. If you are using an API instead of a Ray model deployment to do chunking, the minimum requirement is that the response matches what the Local Chunker stage requires as input.
Accepted format
The Local Chunker stage accepts data formatted as follows:
{
"chunkedData": "{\"spans\": [[0, 13]], \"vectors\": [{\"vector\": [171, 167, 127, 148, 141...]}], \"chunks\": [\"I love Fusion\"]}"
}
You can obtain this format by converting a response such as the following to a single JSON string and using that string as the value of the `chunkedData` key:
{
"spans": [
[ 0,13]
],
"vectors": [
{"vector": [171, 167, 127, 148, 141...]}
],
"chunks": [
"I love Fusion"
]
}
In the Develop and deploy a chunking machine learning model with Ray tutorial, the key to use as the Input Context Variable is `response`.
To use the Local Chunker stage, add the following to your `solrconfig.xml` file:
<!-- FUSION NOTES: These query parsers are used with Solr-based vector search -->
<!-- Each entry registers a query parser under the given name, backed by the listed plugin class. -->
<queryParser name="xvecSim" class="org.apache.solr.lwbackported.XVecSimQParserPlugin"/>
<queryParser name="_lw_chunk_wrap" class="org.apache.solr.lw.ParentAndAllKidsWrapperQParserPlugin"/>
<queryParser name="neuralHybrid" class="org.apache.solr.lw.NeuralHybridQParserPlugin"/>
Vector definitions
Add the following vector definitions to your `managed-schema.xml` file:
<!-- Vector search fields: a field name ending in _64v ... _1024v is mapped to the knn vector type of the same dimension -->
<dynamicField name="*_64v" type="knn_64_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_128v" type="knn_128_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_256v" type="knn_256_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_384v" type="knn_384_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_512v" type="knn_512_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_768v" type="knn_768_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_1024v" type="knn_1024_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<!-- Field types to support vector search: one dense vector type per dimension, all using cosine similarity and HNSW (max connections 45, beam width 200) -->
<fieldType name="knn_64_vector" class="solr.DenseVectorField" vectorDimension="64" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_128_vector" class="solr.DenseVectorField" vectorDimension="128" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_256_vector" class="solr.DenseVectorField" vectorDimension="256" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_384_vector" class="solr.DenseVectorField" vectorDimension="384" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_512_vector" class="solr.DenseVectorField" vectorDimension="512" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_768_vector" class="solr.DenseVectorField" vectorDimension="768" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_1024_vector" class="solr.DenseVectorField" vectorDimension="1024" similarityFunction="cosine" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<!-- INCOMPATIBLE with versions before 5.9.13 -->
<!-- BYTE Vector search fields: a field name ending in _64bv ... _1024bv is mapped to the byte vector type of the same dimension -->
<dynamicField name="*_64bv" type="knn_64_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_128bv" type="knn_128_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_256bv" type="knn_256_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_384bv" type="knn_384_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_512bv" type="knn_512_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_768bv" type="knn_768_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<dynamicField name="*_1024bv" type="knn_1024_byte_vector" indexed="true" stored="true" docValues="false" multiValued="false"/>
<!-- BYTE Field Types to support vector search: same cosine/HNSW settings for every dimension, with vectorEncoding set to BYTE -->
<fieldType name="knn_64_byte_vector" class="solr.DenseVectorField" vectorDimension="64" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_128_byte_vector" class="solr.DenseVectorField" vectorDimension="128" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_256_byte_vector" class="solr.DenseVectorField" vectorDimension="256" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_384_byte_vector" class="solr.DenseVectorField" vectorDimension="384" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_512_byte_vector" class="solr.DenseVectorField" vectorDimension="512" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_768_byte_vector" class="solr.DenseVectorField" vectorDimension="768" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
<fieldType name="knn_1024_byte_vector" class="solr.DenseVectorField" vectorDimension="1024" similarityFunction="cosine" vectorEncoding="BYTE" knnAlgorithm="hnsw" hnswMaxConnections="45" hnswBeamWidth="200"/>
JavaScript to transform JSON to a string
If you are returning JSON from the API, use the following code sample to pass the JSON response to the Local Chunker stage.
// Read the "response" entry of the API result stored in the pipeline context under "api_response".
var api_json_response = ctx.get("api_response")["response"]
// The Local Chunker stage expects a single JSON string, so serialize the object.
var stringified_api_response = JSON.stringify(api_json_response)
// Store the string under "chunkedData". The quotes must be straight ASCII quotes;
// typographic (curly) quotes are a JavaScript syntax error.
ctx.put("chunkedData", stringified_api_response)
Configuration
When entering configuration values in the UI, use unescaped characters, such as \t for the tab character. When entering configuration values in the API, use escaped characters, such as \\t for the tab character.