> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Custom configuration

> Lucidworks AI custom embedding model training

export const schema = {
  // Parameter schema for Lucidworks AI custom embedding model training.
  // It is passed to <SchemaParamFields schema={schema} /> on this page, which reads
  // `description`, `required`, and each property's `title`, `type`, `default`,
  // `hints`, and `description`. Values such as "eCommerce=..., general=..." in
  // `default` are display strings describing per-model-type defaults, not literal values.
  "additionalProperties": true,
  "type": "object",
  "category": "Other",
  "categoryPriority": 1,
  "title": "Lucidworks AI Custom Embedding Configuration",
  "description": "The configuration parameters for training a Custom Embedding Model through Lucidworks AI.",
  "required": ["dataset_config", "trainer_config"],
  "properties": {
    "dataset_config": {
      "additionalProperties": true,
      "config": "dataset_config",
      "description": "This field is a parent config that sets defaults for what can be used for training and evaluation and dataset specific parameters: where it's located, fields that should be used, monitor metric, etc.",
      "minLength": 1,
      "title": "dataset_config",
      "type": "string",
      // NOTE(review): the general default below repeats the ecommerce value, and the
      // examples on this page pass "mlp_general" / "mlp_ecommerce" for dataset_config
      // rather than the *_rnn names listed here. Confirm the intended default and enum.
      "default": "eCommerce='mlp_ecommerce_rnn', general='mlp_ecommerce_rnn'",
      "enum": ["mlp_ecommerce_rnn", "mlp_general_rnn"],
      "properties": {
        "dataset_config.pkid_col_name": {
          "description": "This field allows the pkid (primary key ID) column to be mapped to another column name if `pkid` is not present in the columns.\nThe pkid is a unique value for each document. Entries with a duplicate pkid are filtered out. Since not every pkid entry is associated with a query, there may be entries in the catalog index file that are not associated with a query file entry. It is required if not the default",
          "title": "dataset_config.pkid_col_name",
          "type": "string",
          "hints": ["advanced"],
          "default": "pkid",
          "enum": ["any string"]
        },
        "dataset_config.index_title_col_name": {
          "description": "This field allows title to be mapped to another column name if `title` is not present in the columns.\nIf title and desc (description) are both provided in your config, they will need to be concatenated into a single text field at indexing. This is because title+desc are concatenated into a single text during model training. If only one is provided, then it doesn’t matter which field is used.",
          "title": "dataset_config.index_title_col_name",
          "type": "string",
          "hints": ["advanced"],
          "default": "eCommerce='name', general=null",
          "enum": ["any string", "null"]
        },
        "dataset_config.index_desc_col_name": {
          "description": "This field allows desc (description) to be mapped to another column name if `desc` is not present in the columns.\nIf title and desc (description) are both provided in your config, they will need to be concatenated into a single text field at indexing. This is because title+desc are concatenated into a single text during model training. If only one is provided, then it doesn’t matter which field is used.",
          "title": "dataset_config.index_desc_col_name",
          "type": "string",
          "hints": ["advanced"],
          "default": "eCommerce=null, general='text'",
          "enum": ["any string", "null"]
        },
        "dataset_config.index_body_col_name": {
          "description": "This field allows body to be mapped to another column name if `body` is not present in the columns.\nThe body field is used purely for vocabulary creation and custom token embeddings training. If there is a lengthy text field that doesn’t make sense to use for training, it still might be helpful to use it to improve vocabulary coverage and tokenization.",
          "title": "dataset_config.index_body_col_name",
          "type": "string",
          "hints": ["advanced"],
          "default": "null",
          "enum": ["any string", "null"]
        },
        "dataset_config.query_col_name": {
          "description": "This field allows query to be mapped to another column name if `query` is not present in the columns. It is required if not the default.",
          "title": "dataset_config.query_col_name",
          "type": "string",
          "hints": ["advanced"],
          "default": "query",
          "enum": ["any string", "null"]
        },
        "dataset_config.weight_col_name": {
          "description": "This field allows weight to be mapped to another column name if weight is not present in the columns. It is required if not the default. ",
          "title": "dataset_config.weight_col_name",
          "type": "string",
          "hints": ["advanced"],
          "default": "eCommerce='aggr_count', general=null",
          "enum": ["any string", "null"]
        },
        "dataset_config.metrics_config.monitor_metric": {
          "description": "This field determines the monitor metric. The main metric at k that should be monitored to decide when to stop training. Possible main metrics are: hit, map, mrr, ndcg, & recall. It’s mainly used in deciding when the early stopping should happen. Specifically, when there is no increase in the dataset_config.metrics_config.monitor_metric value for a particular number of epochs (controlled by trainer_config.monitor_patience parameter), the training stops.",
          "title": "dataset_config.metrics_config.monitor_metric",
          "type": "string",
          "hints": ["advanced"],
          "default": "eCommerce='ndcg@5', general='mrr@3'",
          // The backslashes are doubled so the string holds the regex word-boundary
          // token; a single "\b" in a JS string literal is a backspace character.
          "pattern": "\\b(?:hit|map|mrr|ndcg|recall|f1)@(?:1|3|5|10)\\b",
          "enum": ["hit@1", "hit@3", "hit@5", "hit@10", "map@1", "map@3", "map@5", "map@10", "mrr@1", "mrr@3", "mrr@5", "mrr@10", "ndcg@1", "ndcg@3", "ndcg@5", "ndcg@10", "recall@1", "recall@3", "recall@5", "recall@10", "f1@1", "f1@3", "f1@5", "f1@10"]
        }
      }
    },
    "trainer_config": {
      "additionalProperties": true,
      "config": "trainer_config",
      "description": "This field is a parent config that sets defaults for: what kind of text processing should be applied to the data, which encoder architecture to use, which loss function and its parameters to use, which optimizer and its parameters to use, which learning rate scheduler and its parameters to use, specifies metric names and range at which they should.",
      "title": "trainer_config",
      "type": "string",
      "default": "eCommerce='mlp_ecommerce_rnn', general='mlp_general_rnn'",
      "enum": ["mlp_ecommerce_rnn", "mlp_general_rnn"],
      "properties": {
        // This key uses "/" instead of "." because it selects a named group of parameters.
        "trainer_config/text_processor_config": {
          "description": "This field determines which type of tokenization and embedding is used as the base for the recurrent neural network (RNN) model. This field only displays for custom models with a TRAINING_FAILED status. For more information, see Lucidworks AI Models API text processors. From that topic, select View API specification for detailed API information.",
          "title": "trainer_config/text_processor_config",
          "type": "string",
          "hints": ["advanced"],
          "default": "word_en",
          "enum": ["word_en", "bpe_en_small", "bpe_en_large", "bpe_multi", "bpe_bg_small", "bpe_bg_large", "bpe_de_small", "bpe_de_large", "bpe_es_small", "bpe_es_large", "bpe_fr_small", "bpe_fr_large", "bpe_it_small", "bpe_it_large", "bpe_ja_small", "bpe_ja_large", "bpe_ko_small", "bpe_ko_large", "bpe_nl_small", "bpe_nl_large", "bpe_ro_small", "bpe_ro_large", "bpe_zh_small", "bpe_zh_large", "word_custom", "bpe_custom"]
        },
        "trainer_config.encoder_config.emb_trainable": {
          "description": "This field determines if fine-tuning of the token embeddings is enabled. Examples of token embedding are word or byte pair encoding (BPE) token vectors. If set, it can improve the quality of the model if the query contains less natural language that negatively impacts training. Because the embeddings layer is the largest layer in the network, the process to improve the model requires enough training data to prevent overfitting.",
          "title": "trainer_config.encoder_config.emb_trainable",
          "type": "boolean",
          "hints": ["advanced"],
          "default": "eCommerce=true, general=false"
        },
        "trainer_config.encoder_config.emb_spdp": {
          "description": "This field provides a regularization effect, which is the process to simplify result answers. The regularization is applied between the token embeddings layer and the first recurrent neural network (RNN) layer.\nIt is rare for this parameter field to require a change from the default.",
          "title": "trainer_config.encoder_config.emb_spdp",
          "type": "float",
          "minimum": 0,
          "maximum": 1,
          "hints": ["advanced"],
          "default": 0.3,
          "enum": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        },
        "trainer_config.encoder_config.rnn_names_list": {
          "description": "This field determines which bi-directional recurrent neural network (RNN) layers are used. The length of this list must be matched to the list length on the trainer_config.encoder_config.rnn_units_list",
          "title": "trainer_config.encoder_config.rnn_names_list",
          "type": "List <string>",
          "hints": ["advanced"],
          "default": "[ 'gru' ]",
          "enum": ["gru", "lstm"]
        },
        "trainer_config.encoder_config.rnn_units_list": {
          "description": "The number of units for each recurrent neural network (RNN) layer.\nBecause this is a bi-directional RNN, the encoder’s vector size is two times larger than the number of units in the last layer. For example, if one layer is 128 units, the output vector size is 256.",
          "title": "trainer_config.encoder_config.rnn_units_list",
          "type": "List <integer>",
          "hints": ["advanced"],
          "default": "[ 128 ]",
          "enum": [16, 32, 64, 128, 256, 512]
        },
        "trainer_config.num_epochs": {
          "description": "The number of epochs the training data must complete. An epoch is a full cycle where training data passes through the designated algorithms. During one epoch, the model processes all the training data examples (queries and index documents) at least one time.",
          "title": "trainer_config.num_epochs",
          "type": "integer",
          "hints": ["advanced"],
          "default": 64,
          "minimum": 1
        },
        "trainer_config.monitor_patience": {
          "description": "The number of epochs the training passes before it stops if there is no validation metric improvement during the epochs. The best model state based on the monitor validation metric is used as the final model.\nMonitor patience and monitor metric are interdependent.",
          "title": "trainer_config.monitor_patience",
          "type": "integer",
          "hints": ["advanced"],
          "minimum": 1,
          "default": "eCommerce=16, general=8"
        },
        "trainer_config.trn_batch_size": {
          "description": "The batch size to be used for a single model training update. By default, an appropriate batch size is automatically determined based on the dataset size. If the field is set to `null`, the batch size is also automatically determined based on the dataset size.",
          "title": "trainer_config.trn_batch_size",
          "type": "integer",
          "hints": ["advanced"],
          "minimum": 1,
          "default": "null"
        }
      }
    }
  }
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-embedding-model-configuration

[mintlify link]: https://doc.lucidworks.com/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-embedding-model-configuration

[old doc.lw link]: https://doc.lucidworks.com/lw-platform/ai/tnwswb

<LwTemplate />

## Custom configuration

Custom configuration is used to train models with advanced parameters in the [Custom model training user interface](/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-model-training-user-interface).

## Custom configuration scenarios

Custom configuration is typically used in the following scenarios:

* Training using the [Lucidworks AI Models API](/docs/lw-platform/lw-ai/lw-ai-apis/lw-ai-models-api), which requires the minimal, required custom configuration JSON.
* Training with advanced parameters using either the [Custom model training user interface](/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-model-training-user-interface) or the [Lucidworks AI Models API](/docs/lw-platform/lw-ai/lw-ai-apis/lw-ai-models-api).

## Custom configuration parameters

This section contains the most commonly used and important configuration parameters. You can use these parameters in the:

* [Custom model training user interface](/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-model-training-user-interface) > [Create a new model > Manual entry > Custom Config](/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/create-and-deploy-custom-models#manual-entry) field
* [Lucidworks AI Models API](/docs/lw-platform/lw-ai/lw-ai-apis/lw-ai-models-api)

### Model type

To set the dataset and training defaults, enter the appropriate value in the `dataset_config` and `trainer_config` fields:

* `mlp_general_rnn`. This is used for the general recurrent neural networks (RNN) model type.
* `mlp_ecommerce_rnn`. This is used for an ecommerce RNN model type.

For example, to set the model type, use one of the following:

<Tabs>
  <Tab title="General RNN model">
    ```json wrap theme={"dark"}
    {
      "dataset_config": "mlp_general",
      "trainer_config": "mlp_general_rnn"
    }
    ```
  </Tab>

  <Tab title="Ecommerce RNN model">
    ```json wrap theme={"dark"}
    {
      "dataset_config": "mlp_ecommerce",
      "trainer_config": "mlp_ecommerce_rnn"
    }
    ```
  </Tab>
</Tabs>

### Model parameters

The parameters nested in `trainer_config` let you set training and model encoder parameters. The most important parameters are:

* `"trainer_config/text_processor_config": "word_en"`. Determines which type of tokenization and embedding is used as the base for the recurrent neural network (RNN) model. For example, word or byte-pair encoding (BPE). For information about values, see [Text processor](/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-embedding-rnn-models#text-processor).

  <Tip>
    **Important**
    The syntax for the text\_processor\_config parameter name must use a forward slash `/` and not a period `.` because it sets a group of parameters using a group name. Other nested parameters use a period `.`.
  </Tip>

* `"trainer_config.encoder_config.rnn_names_list": ["gru"]`. Determines which bi-directional recurrent neural network (RNN) layers are used. Options include `gru` and `lstm`.

* `"trainer_config.encoder_config.rnn_units_list": [128]`. The number of units for each recurrent neural network (RNN) layer.\
  Because this is a bi-directional RNN, the encoder’s vector size is two times larger than the number of units in the last layer. For example, if one layer is 128 units, the output vector size is 256.

The `trainer_config.encoder_config.rnn_units_list` and `trainer_config.encoder_config.rnn_names_list` lists must be the same length, with one units value for each RNN layer. For example, if `rnn_names_list` is `["gru", "gru"]`, then `rnn_units_list` must contain two values, such as `[128, 64]`.

## Advanced custom configuration parameters

This section describes the most common advanced custom configuration parameters you can alter. Modifying the values does not typically provide a significant boost in quality. However, setting values incorrectly may cause serious quality degradation.

### Advanced model parameters

* `"trainer_config.trn_batch_size": null`. The batch size to be used for a single model training update. By default, an appropriate batch size is automatically determined based on the dataset size. If the field is set to `null`, the batch size is also automatically determined based on the dataset size.
* `"trainer_config.num_epochs": 64`. The number of epochs the training data must complete. An epoch is a full cycle where training data passes through the designated algorithms. During one epoch, the model processes all the training data examples (queries and index documents) at least one time.
* `"trainer_config.monitor_patience": 8`. The number of epochs the training passes before it stops if there is no validation metric improvement during the epochs. The best model state based on the monitor validation metric is used as the final model.

  * For the general RNN, the `mrr@3` metric is monitored and the `monitor_patience` default value is 8.
  * For the ecommerce RNN, the `ndcg@5` metric is monitored and the `monitor_patience` default value is 16.
* `"trainer_config.encoder_config.emb_spdp": 0.3`. This field provides a regularization effect, which is the process to simplify result answers. The regularization is applied between the token embeddings layer and the first recurrent neural network (RNN) layer.
* `"trainer_config.encoder_config.emb_trainable"`. This field determines if fine-tuning of the token embeddings is enabled. Examples of token embedding are word or byte pair encoding (BPE) token vectors. If set, it can improve the quality of the model if the query contains less natural language that negatively impacts training. Because the embeddings layer is the largest layer in the network, the process to improve the model requires enough training data to prevent overfitting.\
  The default values are:

  * `"trainer_config.encoder_config.emb_trainable": false`. For `mlp_general` models.
  * `"trainer_config.encoder_config.emb_trainable": true`. For `mlp_ecommerce` models.

## Custom configuration examples

To create a custom configuration, set a `dataset_config` and `trainer_config`. To avoid degrading the quality of the trained model, only specify parameters whose values need to differ from the defaults.

<Note>
  For detailed information about `dataset_config` for the index and query files, see [use case training data](/docs/lw-platform/lw-ai/lw-ai-custom-embedding-model-training/custom-embedding-model-training-data).
</Note>

<AccordionGroup>
  <Accordion title="General configuration">
    This configuration uses all of the defaults for general RNN training, since no values deviating from the defaults are specified.

    In most cases, this configuration is sufficient if all of these apply:

    * Index Parquet file contains the `pkid` and `text` columns
    * Query Parquet file contains the `pkid` and `query` columns

    <Tabs>
      <Tab title="Basic configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_general",
          "trainer_config": "mlp_general_rnn"
        }
        ```
      </Tab>

      <Tab title="Advanced configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_general",
          "dataset_config.pkid_col_name": "pkid",
          "dataset_config.index_title_col_name": null,
          "dataset_config.index_desc_col_name": "text",
          "dataset_config.index_body_col_name": null,
          "dataset_config.query_col_name": "query",
          "dataset_config.weight_col_name": null,
          "dataset_config.metrics_config.monitor_metric": "mrr@3",
          "trainer_config": "mlp_general_rnn",
          "trainer_config/text_processor_config": "word_en",
          "trainer_config.encoder_config.emb_trainable": false,
          "trainer_config.encoder_config.emb_spdp": 0.3,
          "trainer_config.encoder_config.rnn_names_list": ["gru"],
          "trainer_config.encoder_config.rnn_units_list": [128],
          "trainer_config.num_epochs": 64,
          "trainer_config.monitor_patience": 8,
          "trainer_config.trn_batch_size": null
        }
        ```
      </Tab>
    </Tabs>
  </Accordion>

  <Accordion title="General configuration with multilingual BPE embeddings">
    This configuration uses all of the defaults for general RNN training except `"trainer_config/text_processor_config": "bpe_multi"`. No other values deviate from the defaults.

    This configuration is sufficient if all of these apply:

    * Index Parquet file contains the `pkid` and `text` columns
    * Query Parquet file contains the `pkid` and `query` columns
    * Index and query data are composed of multilingual text

    <Tabs>
      <Tab title="Basic configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_general",
          "trainer_config": "mlp_general_rnn",
          "trainer_config/text_processor_config": "bpe_multi"
        }
        ```
      </Tab>

      <Tab title="Advanced configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_general",
          "dataset_config.pkid_col_name": "pkid",
          "dataset_config.index_title_col_name": null,
          "dataset_config.index_desc_col_name": "text",
          "dataset_config.index_body_col_name": null,
          "dataset_config.query_col_name": "query",
          "dataset_config.weight_col_name": null,
          "dataset_config.metrics_config.monitor_metric": "mrr@3",
          "trainer_config": "mlp_general_rnn",
          "trainer_config/text_processor_config": "bpe_multi",
          "trainer_config.encoder_config.emb_trainable": false,
          "trainer_config.encoder_config.emb_spdp": 0.3,
          "trainer_config.encoder_config.rnn_names_list": ["gru"],
          "trainer_config.encoder_config.rnn_units_list": [128],
          "trainer_config.num_epochs": 64,
          "trainer_config.monitor_patience": 8,
          "trainer_config.trn_batch_size": null
        }
        ```
      </Tab>
    </Tabs>
  </Accordion>

  <Accordion title="General configuration with token embeddings fine-tuning">
    This configuration uses all of the defaults for general RNN training except `"trainer_config.encoder_config.emb_trainable": true`, which enables embedding training. No other values deviate from the defaults.

    This configuration is sufficient if all of these apply:

    * Index Parquet file contains the `pkid` and `text` columns
    * Query Parquet file contains the `pkid` and `query` columns
    * Your data contains a significant number of business-specific or misspelled words, such as in ecommerce use cases

    <Tabs>
      <Tab title="Basic configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_general",
          "trainer_config": "mlp_general_rnn",
          "trainer_config.encoder_config.emb_trainable": true
        }
        ```
      </Tab>

      <Tab title="Advanced configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_general",
          "dataset_config.pkid_col_name": "pkid",
          "dataset_config.index_title_col_name": null,
          "dataset_config.index_desc_col_name": "text",
          "dataset_config.index_body_col_name": null,
          "dataset_config.query_col_name": "query",
          "dataset_config.weight_col_name": null,
          "dataset_config.metrics_config.monitor_metric": "mrr@3",
          "trainer_config": "mlp_general_rnn",
          "trainer_config/text_processor_config": "word_en",
          "trainer_config.encoder_config.emb_trainable": true,
          "trainer_config.encoder_config.emb_spdp": 0.3,
          "trainer_config.encoder_config.rnn_names_list": ["gru"],
          "trainer_config.encoder_config.rnn_units_list": [128],
          "trainer_config.num_epochs": 64,
          "trainer_config.monitor_patience": 8,
          "trainer_config.trn_batch_size": null
        }
        ```
      </Tab>
    </Tabs>
  </Accordion>

  <Accordion title="Classification configuration for custom embedding model">
    The custom embedding models for classification use the following parameters:

    * The Index file contains:
      * `dataset_config.pkid_col_name` where `label` is the default value
      * `dataset_config.index_title_col_name` where `label` is the default value
      * `dataset_config.index_desc_col_name` where `null` is the default value
      * `dataset_config.index_body_col_name` where `null` is the default value

    * The Query file contains:
      * `dataset_config.pkid_col_name` where `pkid` is the default value and is used for `class` values
      * `dataset_config.query_col_name` where freeform text is the default value
      * `dataset_config.weight_col_name` where `null` is the default value in this positive numeric field

    <Tabs>
      <Tab title="Ecommerce example">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_classification",
          "trainer_config": "mlp_ecommerce_rnn"
        }
        ```
      </Tab>

      <Tab title="General example">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_classification",
          "trainer_config": "mlp_general_rnn"
        }
        ```
      </Tab>

      <Tab title="Transformer example">
        The basic transformer example uses the `"trainer_config": "mlp_transformer"` value.\
        For example, if the transformer is `gte_large_rnn`, use:

        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_classification",
          "trainer_config": "gte_large_rnn"
        }
        ```
      </Tab>
    </Tabs>

    For more information about the configuration parameters, see [Configuration](#configuration).
  </Accordion>

  <Accordion title="Ecommerce configuration">
    This configuration uses all of the defaults for ecommerce RNN training, since no values deviate from the defaults.

    In most cases, this configuration is sufficient if all of these apply:

    * Index Parquet file contains the `pkid` and `name` columns
    * Query Parquet file contains the `pkid`, `query`, and `weight` columns

    <Tabs>
      <Tab title="Basic configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_ecommerce",
          "trainer_config": "mlp_ecommerce_rnn"
        }
        ```
      </Tab>

      <Tab title="Advanced configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_ecommerce",
          "dataset_config.pkid_col_name": "pkid",
          "dataset_config.index_title_col_name": "name",
          "dataset_config.index_desc_col_name": null,
          "dataset_config.index_body_col_name": null,
          "dataset_config.query_col_name": "query",
          "dataset_config.weight_col_name": "aggr_count",
          "dataset_config.metrics_config.monitor_metric": "ndcg@5",
          "trainer_config": "mlp_ecommerce_rnn",
          "trainer_config/text_processor_config": "word_en",
          "trainer_config.encoder_config.emb_trainable": true,
          "trainer_config.encoder_config.emb_spdp": 0.3,
          "trainer_config.encoder_config.rnn_names_list": ["gru"],
          "trainer_config.encoder_config.rnn_units_list": [128],
          "trainer_config.num_epochs": 64,
          "trainer_config.monitor_patience": 16,
          "trainer_config.trn_batch_size": null
        }
        ```
      </Tab>
    </Tabs>
  </Accordion>

  <Accordion title="Ecommerce configuration with Japanese small BPE embeddings">
    This configuration uses all of the defaults for ecommerce RNN training except `"trainer_config/text_processor_config": "bpe_ja_small"`. No other values deviate from the defaults.

    This configuration is sufficient if all of these apply:

    * Index Parquet file contains the `pkid` and `name` columns
    * Query Parquet file contains the `pkid` and `query` columns
    * Index and query data are composed of Japanese text

    <Tabs>
      <Tab title="Basic configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_ecommerce",
          "trainer_config": "mlp_ecommerce_rnn",
          "trainer_config/text_processor_config": "bpe_ja_small"
        }
        ```
      </Tab>

      <Tab title="Advanced configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_ecommerce",
          "dataset_config.pkid_col_name": "pkid",
          "dataset_config.index_title_col_name": "name",
          "dataset_config.index_desc_col_name": null,
          "dataset_config.index_body_col_name": null,
          "dataset_config.query_col_name": "query",
          "dataset_config.weight_col_name": "aggr_count",
          "dataset_config.metrics_config.monitor_metric": "ndcg@5",
          "trainer_config": "mlp_ecommerce_rnn",
          "trainer_config/text_processor_config": "bpe_ja_small",
          "trainer_config.encoder_config.emb_trainable": true,
          "trainer_config.encoder_config.emb_spdp": 0.3,
          "trainer_config.encoder_config.rnn_names_list": ["gru"],
          "trainer_config.encoder_config.rnn_units_list": [128],
          "trainer_config.num_epochs": 64,
          "trainer_config.monitor_patience": 16,
          "trainer_config.trn_batch_size": null
        }
        ```
      </Tab>
    </Tabs>
  </Accordion>

  <Accordion title="Ecommerce configuration with 2 RNN layers and 128 output vector size">
    This configuration uses all of the defaults for ecommerce RNN training except `"trainer_config.encoder_config.rnn_names_list": ["gru", "gru"]`, which adds an additional GRU layer, and `"trainer_config.encoder_config.rnn_units_list": [128, 64]`, which specifies 64 units for the second GRU layer. No other values deviate from the defaults.

    This configuration is sufficient if all of these apply:

    * Index Parquet file contains the `pkid` and `name` columns
    * Query Parquet file contains the `pkid` and `query` columns
    * The output of the model is 128 vector dimension

    <Tabs>
      <Tab title="Basic configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_ecommerce",
          "trainer_config": "mlp_ecommerce_rnn",
          "trainer_config.encoder_config.rnn_names_list": ["gru", "gru"],
          "trainer_config.encoder_config.rnn_units_list": [128, 64]
        }
        ```
      </Tab>

      <Tab title="Advanced configuration">
        ```json wrap theme={"dark"}
        {
          "dataset_config": "mlp_ecommerce",
          "dataset_config.pkid_col_name": "pkid",
          "dataset_config.index_title_col_name": "name",
          "dataset_config.index_desc_col_name": null,
          "dataset_config.index_body_col_name": null,
          "dataset_config.query_col_name": "query",
          "dataset_config.weight_col_name": "aggr_count",
          "dataset_config.metrics_config.monitor_metric": "ndcg@5",
          "trainer_config": "mlp_ecommerce_rnn",
          "trainer_config/text_processor_config": "word_en",
          "trainer_config.encoder_config.emb_trainable": true,
          "trainer_config.encoder_config.emb_spdp": 0.3,
          "trainer_config.encoder_config.rnn_names_list": ["gru", "gru"],
          "trainer_config.encoder_config.rnn_units_list": [128, 64],
          "trainer_config.num_epochs": 64,
          "trainer_config.monitor_patience": 16,
          "trainer_config.trn_batch_size": null
        }
        ```
      </Tab>
    </Tabs>
  </Accordion>

  <Accordion title="General configuration with all_minilm_l6_rnn transformer">
    This configuration uses the `all_minilm_l6_rnn` transformer.

    ```json wrap  theme={"dark"}
    {
      "dataset_config": "mlp_general",
      "trainer_config": "all_minilm_l6_rnn"
    }
    ```
  </Accordion>

  <Accordion title="Ecommerce configuration with all_minilm_l6_rnn transformer">
    This configuration uses the `all_minilm_l6_rnn` transformer.

    ```json wrap  theme={"dark"}
    {
      "dataset_config": "mlp_ecommerce",
      "trainer_config": "all_minilm_l6_rnn"
    }
    ```
  </Accordion>
</AccordionGroup>

## Configuration

<SchemaParamFields schema={schema} />
