inference:
  greedy: False # Whether to use greedy decoding; if False, use sampling
  top_k: 0 # The number of highest-probability vocabulary tokens to keep for top-k filtering.
  top_p: 0.9 # If set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the BOS token at the beginning of the prompt
  tokens_to_generate: 30 # The maximum length of the sequence to be generated.
  all_probs: False # whether to return the log probabilities for all tokens in the vocab
  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
  compute_logprob: False # a flag to compute the logprob of all the input text, a very special case of running inference, default False

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

inference_batch_size: 2
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models
retro_model_file: null # RETRO .nemo file path
use_predict_method: False # whether to use the predict method

prompts: # prompts for RETRO model inference
  - "hello,"
  - "good morning,"
  - "good afternoon,"
  - "good evening,"

########### Faiss service parameters ########
retrieval_service:
  strategy: RetroModelTextGenerationStrategy # choose customized inference strategy
  neighbors: 4
  frequent_query: False # if True, update the retrieval context on every generated token; if False, update it every 64 tokens
  pad_tokens: True # pad the prompt at the beginning to a minimum of 64 tokens so retrieval happens at least once
  store_retrieved: False # whether to store the retrieved documents so they can be inspected
  combo_service:
    service_ip: '0.0.0.0'
    service_port: 17181
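
###########################################################################
# Example launch (a minimal sketch): NeMo configs like this one are
# typically consumed via Hydra, so any field above can be overridden on
# the command line with dotted keys. The script path and the .nemo
# checkpoint path below are assumptions, not part of this config.
#
#   python examples/nlp/language_modeling/megatron_retro_eval.py \
#       retro_model_file=/path/to/retro_model.nemo \
#       trainer.devices=1 \
#       inference.greedy=False \
#       inference.tokens_to_generate=50
###########################################################################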