Module: LLaMACpp

Defined in:
lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/dummy.rb,
ext/llama_cpp/dummy.rb

Overview

llama_cpp.rb provides Ruby bindings for llama.cpp.

Defined Under Namespace

Classes: Batch, Context, ContextParams, Grammar, GrammarElement, Model, ModelKVOverride, ModelParams, ModelQuantizeParams, Timings, TokenData, TokenDataArray

Constant Summary collapse

VERSION =

The version of llama_cpp.rb you install.

'0.17.10'
LLAMA_CPP_VERSION =

The supported version of llama.cpp.

'b3676'
LLAMA_FILE_MAGIC_GGLA =

LLaMA file magic.

'0x67676c61u'
LLAMA_FILE_MAGIC_GGSN =

LLaMA file magic.

'0x6767736eu'
LLAMA_FILE_MAGIC_GGSQ =

LLaMA file magic.

'0x67677371u'
LLAMA_SESSION_MAGIC =

LLaMA session magic.

'0x6767736e'
LLAMA_STATE_SEQ_MAGIC =

LLaMA state seq magic.

'0x67677371u'
LLAMA_SESSION_VERSION =

LLaMA session version.

'8'
LLAMA_STATE_SEQ_VERSION =

LLaMA state seq version.

'2'
LLAMA_DEFALUT_SEED =

LLaMA default random seed.

'0xFFFFFFFF'
LLAMA_VOCAB_TYPE_NONE =

LLaMA vocabulary type.

0
LLAMA_VOCAB_TYPE_SPM =

LLaMA vocabulary type.

1
LLAMA_VOCAB_TYPE_BPE =

LLaMA vocabulary type.

2
LLAMA_VOCAB_TYPE_WPM =

LLaMA vocabulary type.

3
LLAMA_VOCAB_TYPE_UGM =

LLaMA vocabulary type.

4
LLAMA_VOCAB_TYPE_RWKV =

LLaMA vocabulary type.

5
LLAMA_VOCAB_PRE_TYPE_DEFAULT =

LLaMA vocabulary pre-tokenization type.

0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 =

LLaMA vocabulary pre-tokenization type.

1
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM =

LLaMA vocabulary pre-tokenization type.

2
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER =

LLaMA vocabulary pre-tokenization type.

3
LLAMA_VOCAB_PRE_TYPE_FALCON =

LLaMA vocabulary pre-tokenization type.

4
LLAMA_VOCAB_PRE_TYPE_MPT =

LLaMA vocabulary pre-tokenization type.

5
LLAMA_VOCAB_PRE_TYPE_STARCODER =

LLaMA vocabulary pre-tokenization type.

6
LLAMA_VOCAB_PRE_TYPE_GPT2 =

LLaMA vocabulary pre-tokenization type.

7
LLAMA_VOCAB_PRE_TYPE_REFACT =

LLaMA vocabulary pre-tokenization type.

8
LLAMA_VOCAB_PRE_TYPE_COMMAND_R =

LLaMA vocabulary pre-tokenization type.

9
LLAMA_VOCAB_PRE_TYPE_STABLELM2 =

LLaMA vocabulary pre-tokenization type.

10
LLAMA_VOCAB_PRE_TYPE_QWEN2 =

LLaMA vocabulary pre-tokenization type.

11
LLAMA_VOCAB_PRE_TYPE_OLMO =

LLaMA vocabulary pre-tokenization type.

12
LLAMA_VOCAB_PRE_TYPE_DBRX =

LLaMA vocabulary pre-tokenization type.

13
LLAMA_VOCAB_PRE_TYPE_SMAUG =

LLaMA vocabulary pre-tokenization type.

14
LLAMA_VOCAB_PRE_TYPE_PORO =

LLaMA vocabulary pre-tokenization type.

15
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 =

LLaMA vocabulary pre-tokenization type.

16
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 =

LLaMA vocabulary pre-tokenization type.

17
LLAMA_VOCAB_PRE_TYPE_VIKING =

LLaMA vocabulary pre-tokenization type.

18
LLAMA_VOCAB_PRE_TYPE_JAIS =

LLaMA vocabulary pre-tokenization type.

19
LLAMA_VOCAB_PRE_TYPE_TEKKEN =

LLaMA vocabulary pre-tokenization type.

20
LLAMA_VOCAB_PRE_TYPE_SMOLLM =

LLaMA vocabulary pre-tokenization type.

21
LLAMA_VOCAB_PRE_TYPE_CODESHELL =

LLaMA vocabulary pre-tokenization type.

22
LLAMA_VOCAB_PRE_TYPE_BLOOM =

LLaMA vocabulary pre-tokenization type.

23
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH =

LLaMA vocabulary pre-tokenization type.

24
LLAMA_VOCAB_PRE_TYPE_EXAONE =

LLaMA vocabulary pre-tokenization type.

25
LLAMA_TOKEN_ATTR_UNDEFINED =

LLaMA token attribute type.

0
LLAMA_TOKEN_ATTR_UNKNOWN =

LLaMA token attribute type.

1
LLAMA_TOKEN_ATTR_UNUSED =

LLaMA token attribute type.

2
LLAMA_TOKEN_ATTR_NORMAL =

LLaMA token attribute type.

4
LLAMA_TOKEN_ATTR_CONTROL =

LLaMA token attribute type.

8
LLAMA_TOKEN_ATTR_USER_DEFINED =

LLaMA token attribute type.

16
LLAMA_TOKEN_ATTR_BYTE =

LLaMA token attribute type.

32
LLAMA_TOKEN_ATTR_NORMALIZED =

LLaMA token attribute type.

64
LLAMA_TOKEN_ATTR_LSTRIP =

LLaMA token attribute type.

128
LLAMA_TOKEN_ATTR_RSTRIP =

LLaMA token attribute type.

256
LLAMA_TOKEN_ATTR_SINGLE_WORD =

LLaMA token attribute type.

512
LLAMA_FTYPE_ALL_F32 =

LLaMA model file type.

0
LLAMA_FTYPE_MOSTLY_F16 =

LLaMA model file type.

1
LLAMA_FTYPE_MOSTLY_Q4_0 =

LLaMA model file type.

2
LLAMA_FTYPE_MOSTLY_Q4_1 =

LLaMA model file type.

3
LLAMA_FTYPE_MOSTLY_Q8_0 =

LLaMA model file type.

7
LLAMA_FTYPE_MOSTLY_Q5_0 =

LLaMA model file type.

8
LLAMA_FTYPE_MOSTLY_Q5_1 =

LLaMA model file type.

9
LLAMA_FTYPE_MOSTLY_Q2_K =

LLaMA model file type.

10
LLAMA_FTYPE_MOSTLY_Q3_K_S =

LLaMA model file type.

11
LLAMA_FTYPE_MOSTLY_Q3_K_M =

LLaMA model file type.

12
LLAMA_FTYPE_MOSTLY_Q3_K_L =

LLaMA model file type.

13
LLAMA_FTYPE_MOSTLY_Q4_K_S =

LLaMA model file type.

14
LLAMA_FTYPE_MOSTLY_Q4_K_M =

LLaMA model file type.

15
LLAMA_FTYPE_MOSTLY_Q5_K_S =

LLaMA model file type.

16
LLAMA_FTYPE_MOSTLY_Q5_K_M =

LLaMA model file type.

17
LLAMA_FTYPE_MOSTLY_Q6_K =

LLaMA model file type.

18
LLAMA_FTYPE_MOSTLY_IQ2_XXS =

LLaMA model file type.

19
LLAMA_FTYPE_MOSTLY_IQ2_XS =

LLaMA model file type.

20
LLAMA_FTYPE_MOSTLY_Q2_K_S =

LLaMA model file type.

21
LLAMA_FTYPE_MOSTLY_IQ3_XS =

LLaMA model file type.

22
LLAMA_FTYPE_MOSTLY_IQ3_XXS =

LLaMA model file type.

23
LLAMA_FTYPE_MOSTLY_IQ1_S =

LLaMA model file type.

24
LLAMA_FTYPE_MOSTLY_IQ4_NL =

LLaMA model file type.

25
LLAMA_FTYPE_MOSTLY_IQ3_S =

LLaMA model file type.

26
LLAMA_FTYPE_MOSTLY_IQ3_M =

LLaMA model file type.

27
LLAMA_FTYPE_MOSTLY_IQ2_S =

LLaMA model file type.

28
LLAMA_FTYPE_MOSTLY_IQ2_M =

LLaMA model file type.

29
LLAMA_FTYPE_MOSTLY_IQ4_XS =

LLaMA model file type.

30
LLAMA_FTYPE_MOSTLY_IQ1_M =

LLaMA model file type.

31
LLAMA_FTYPE_MOSTLY_BF16 =

LLaMA model file type.

32
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 =

LLaMA model file type.

33
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 =

LLaMA model file type.

34
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 =

LLaMA model file type.

35
LLAMA_FTYPE_MOSTLY_TQ1_0 =

LLaMA model file type.

36
LLAMA_FTYPE_MOSTLY_TQ2_0 =

LLaMA model file type.

37
LLAMA_FTYPE_GUESSED =

LLaMA model file type (not specified in the model file).

1024
LLAMA_KV_OVERRIDE_TYPE_INT =

LLaMA KV override type.

0
LLAMA_KV_OVERRIDE_TYPE_FLOAT =

LLaMA KV override type.

1
LLAMA_KV_OVERRIDE_TYPE_BOOL =

LLaMA KV override type.

2
LLAMA_KV_OVERRIDE_TYPE_STR =

LLaMA KV override type.

3
LLAMA_GRETYPE_END =

GrammarElement type: end of rule definition.

0
LLAMA_GRETYPE_ALT =

GrammarElement type: start of alternate definition for rule.

1
LLAMA_GRETYPE_RULE_REF =

GrammarElement type: non-terminal element: reference to rule.

2
LLAMA_GRETYPE_CHAR =

GrammarElement type: terminal element: character (code point).

3
LLAMA_GRETYPE_CHAR_NOT =

GrammarElement type: inverse char(s) ([^a], [^a-b] [^abc]).

4
LLAMA_GRETYPE_CHAR_RNG_UPPER =

GrammarElement type: modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to be an inclusive range ([a-z]).

5
LLAMA_GRETYPE_CHAR_ALT =

GrammarElement type: modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]).

6
LLAMA_GRETYPE_CHAR_ANY =

GrammarElement type: any character (.)

7
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED =

RoPE scaling type.

-1
LLAMA_ROPE_SCALING_TYPE_NONE =

RoPE scaling type.

0
LLAMA_ROPE_SCALING_TYPE_LINEAR =

RoPE scaling type.

1
LLAMA_ROPE_SCALING_TYPE_YARN =

RoPE scaling type.

2
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE =

RoPE scaling type.

LLAMA_ROPE_SCALING_TYPE_YARN
LLAMA_POOLING_TYPE_UNSPECIFIED =

LLaMA pooling type.

-1
LLAMA_POOLING_TYPE_NONE =

LLaMA pooling type.

0
LLAMA_POOLING_TYPE_MEAN =

LLaMA pooling type.

1
LLAMA_POOLING_TYPE_CLS =

LLaMA pooling type.

2
LLAMA_POOLING_TYPE_LAST =

LLaMA pooling type.

3
LLAMA_ATTENTION_TYPE_UNSPECIFIED =

LLaMA attention type.

-1
LLAMA_ATTENTION_TYPE_CAUSAL =

LLaMA attention type.

0
LLAMA_ATTENTION_TYPE_NON_CAUSAL =

LLaMA attention type.

1
LLAMA_SPLIT_MODE_NONE =

LLaMA split mode: single GPU.

0
LLAMA_SPLIT_MODE_LAYER =

LLaMA split mode: split layers and KV across GPUs.

1
LLAMA_SPLIT_MODE_ROW =

LLaMA split mode: split rows across GPUs.

2

Class Method Summary collapse

Class Method Details

.backend_freeObject

Finalizes the backend, currently only used for MPI.



358
# File 'ext/llama_cpp/dummy.rb', line 358

def backend_free; end

.backend_initObject

Initializes the backend.



355
# File 'ext/llama_cpp/dummy.rb', line 355

def backend_init; end

.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String

Generates sentences following the given prompt for operation check.

Parameters:

  • context (LLaMACpp::Context)

    The context to use.

  • prompt (String)

    The prompt to start generation with.

  • n_predict (Integer) (defaults to: 128)

    The number of tokens to predict.

  • n_keep (Integer) (defaults to: 10)

    The number of tokens to keep in the context.

  • n_batch (Integer) (defaults to: 512)

    The number of tokens to process in a batch.

  • repeat_last_n (Integer) (defaults to: 64)

    The number of tokens to consider for repetition penalty.

  • repeat_penalty (Float) (defaults to: 1.1)

    The repetition penalty.

  • frequency (Float) (defaults to: 0.0)

    The frequency penalty.

  • presence (Float) (defaults to: 0.0)

    The presence penalty.

  • top_k (Integer) (defaults to: 40)

    The number of tokens to consider for top-k sampling.

  • top_p (Float) (defaults to: 0.95)

    The probability threshold for nucleus sampling.

  • tfs_z (Float) (defaults to: 1.0)

    The z parameter for tail-free sampling.

  • typical_p (Float) (defaults to: 1.0)

    The probability for typical sampling.

  • temperature (Float) (defaults to: 0.8)

    The temperature for temperature sampling.

Returns:

  • (String)

Raises:

  • (ArgumentError)


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/llama_cpp.rb', line 27

# Generates sentences following the given prompt for operation check.
#
# @param context [LLaMACpp::Context] The context to use.
# @param prompt [String] The prompt to start generation with.
# @param n_predict [Integer] The number of tokens to predict.
# @param n_keep [Integer] The number of tokens to keep in the context.
# @param n_batch [Integer] The number of tokens to process in a batch.
# @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
# @param repeat_penalty [Float] The repetition penalty.
# @param frequency [Float] The frequency penalty.
# @param presence [Float] The presence penalty.
# @param top_k [Integer] The number of tokens to consider for top-k sampling.
# @param top_p [Float] The probability threshold for nucleus sampling.
# @param tfs_z [Float] The z parameter for tail-free sampling.
# @param typical_p [Float] The probability for typical sampling.
# @param temperature [Float] The temperature for temperature sampling.
# @return [String] The generated text with the prompt stripped from the front.
# @raise [ArgumentError] If context/prompt have unexpected types or the prompt is too long.
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
             top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  # Leading space matches the tokenization convention used by llama.cpp examples.
  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  # Ring buffer of the last n_ctx token ids, used for the repetition penalty
  # and for context swapping.
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      # Context swapping: when the window overflows, keep the first n_keep
      # tokens and re-feed the last half of the evicted tokens.
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        # BUGFIX: splat the slice so the recycled token ids are prepended
        # individually. Without the splat, Array#insert places the slice as a
        # single nested Array, which breaks Batch.get_one and token_to_piece.
        embd.insert(0, *last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      # All prompt tokens consumed: sample the next token from the logits.
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)
      n_remain -= 1
    else
      # Still feeding the prompt: queue up to n_batch input tokens.
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  # Scrub invalid byte sequences and strip the echoed prompt from the output.
  output.join.scrub('?').strip.delete_prefix(prompt).strip
end

.max_devicesInteger

Returns the maximum number of devices.

Returns:

  • (Integer)


381
# File 'ext/llama_cpp/dummy.rb', line 381

def max_devices; end

.model_quantize(input_path:, output_path:, params:) ⇒ Object

Quantizes the model.

Parameters:

  • input_path (String)

    The path to the input model file.

  • output_path (String)

    The path to the output model file.

  • params (ModelQuantizeParams)

    The parameters for model quantization.



370
# File 'ext/llama_cpp/dummy.rb', line 370

def model_quantize(input_path:, output_path:, params:); end

.numa_init(strategy) ⇒ Object

Initializes NUMA.

Parameters:

  • strategy (Integer)

    The NUMA strategy.



363
# File 'ext/llama_cpp/dummy.rb', line 363

def numa_init(strategy); end

.print_system_infoObject

Prints system information.



373
# File 'ext/llama_cpp/dummy.rb', line 373

def print_system_info; end

.supports_gpu_offload?Boolean

Returns the flag for supporting GPU offload.

Returns:

  • (Boolean)


393
# File 'ext/llama_cpp/dummy.rb', line 393

def supports_gpu_offload?; end

.supports_mlock?Boolean

Returns the flag for supporting mlock.

Returns:

  • (Boolean)


389
# File 'ext/llama_cpp/dummy.rb', line 389

def supports_mlock?; end

.supports_mmap?Boolean

Returns the flag for supporting mmap.

Returns:

  • (Boolean)


385
# File 'ext/llama_cpp/dummy.rb', line 385

def supports_mmap?; end

.time_usInteger

Returns the current time in microseconds.

Returns:

  • (Integer)


377
# File 'ext/llama_cpp/dummy.rb', line 377

def time_us; end