Module: LLaMACpp
- Defined in:
- lib/llama_cpp.rb,
lib/llama_cpp/version.rb,
ext/llama_cpp/dummy.rb
Overview
llama_cpp.rb provides Ruby bindings for llama.cpp.
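The typical flow is to load a Model, wrap it in a Context, and pass the context to LLaMACpp.generate. A minimal sketch, assuming the constructors take the keyword arguments shown here (model_path:/params: for Model, model:/params: for Context) and a placeholder GGUF path:

require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)

context_params = LLaMACpp::ContextParams.new
context = LLaMACpp::Context.new(model: model, params: context_params)

puts LLaMACpp.generate(context, 'Hello, World.')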
Defined Under Namespace
Classes: Batch, Context, ContextParams, Grammar, GrammarElement, Model, ModelKVOverride, ModelParams, ModelQuantizeParams, Timings, TokenData, TokenDataArray
Constant Summary
- VERSION =
The version of llama_cpp.rb you install.
'0.17.10'
- LLAMA_CPP_VERSION =
The supported version of llama.cpp.
'b3676'
- LLAMA_FILE_MAGIC_GGLA =
LLaMA file magic.
'0x67676c61u'
- LLAMA_FILE_MAGIC_GGSN =
LLaMA file magic.
'0x6767736eu'
- LLAMA_FILE_MAGIC_GGSQ =
LLaMA file magic.
'0x67677371u'
- LLAMA_SESSION_MAGIC =
LLaMA session magic.
'0x6767736e'
- LLAMA_STATE_SEQ_MAGIC =
LLaMA state seq magic.
'0x67677371u'
- LLAMA_SESSION_VERSION =
LLaMA session version.
'8'
- LLAMA_STATE_SEQ_VERSION =
LLaMA state seq version.
'2'
- LLAMA_DEFALUT_SEED =
LLaMA default random seed.
'0xFFFFFFFF'
- LLAMA_VOCAB_TYPE_NONE =
LLaMA vocabulary type.
0
- LLAMA_VOCAB_TYPE_SPM =
LLaMA vocabulary type.
1
- LLAMA_VOCAB_TYPE_BPE =
LLaMA vocabulary type.
2
- LLAMA_VOCAB_TYPE_WPM =
LLaMA vocabulary type.
3
- LLAMA_VOCAB_TYPE_UGM =
LLaMA vocabulary type.
4
- LLAMA_VOCAB_TYPE_RWKV =
LLaMA vocabulary type.
5
- LLAMA_VOCAB_PRE_TYPE_DEFAULT =
LLaMA vocabulary pre-tokenization type.
0
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 =
LLaMA vocabulary pre-tokenization type.
1
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM =
LLaMA vocabulary pre-tokenization type.
2
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER =
LLaMA vocabulary pre-tokenization type.
3
- LLAMA_VOCAB_PRE_TYPE_FALCON =
LLaMA vocabulary pre-tokenization type.
4
- LLAMA_VOCAB_PRE_TYPE_MPT =
LLaMA vocabulary pre-tokenization type.
5
- LLAMA_VOCAB_PRE_TYPE_STARCODER =
LLaMA vocabulary pre-tokenization type.
6
- LLAMA_VOCAB_PRE_TYPE_GPT2 =
LLaMA vocabulary pre-tokenization type.
7
- LLAMA_VOCAB_PRE_TYPE_REFACT =
LLaMA vocabulary pre-tokenization type.
8
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R =
LLaMA vocabulary pre-tokenization type.
9
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 =
LLaMA vocabulary pre-tokenization type.
10
- LLAMA_VOCAB_PRE_TYPE_QWEN2 =
LLaMA vocabulary pre-tokenization type.
11
- LLAMA_VOCAB_PRE_TYPE_OLMO =
LLaMA vocabulary pre-tokenization type.
12
- LLAMA_VOCAB_PRE_TYPE_DBRX =
LLaMA vocabulary pre-tokenization type.
13
- LLAMA_VOCAB_PRE_TYPE_SMAUG =
LLaMA vocabulary pre-tokenization type.
14
- LLAMA_VOCAB_PRE_TYPE_PORO =
LLaMA vocabulary pre-tokenization type.
15
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 =
LLaMA vocabulary pre-tokenization type.
16
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 =
LLaMA vocabulary pre-tokenization type.
17
- LLAMA_VOCAB_PRE_TYPE_VIKING =
LLaMA vocabulary pre-tokenization type.
18
- LLAMA_VOCAB_PRE_TYPE_JAIS =
LLaMA vocabulary pre-tokenization type.
19
- LLAMA_VOCAB_PRE_TYPE_TEKKEN =
LLaMA vocabulary pre-tokenization type.
20
- LLAMA_VOCAB_PRE_TYPE_SMOLLM =
LLaMA vocabulary pre-tokenization type.
21
- LLAMA_VOCAB_PRE_TYPE_CODESHELL =
LLaMA vocabulary pre-tokenization type.
22
- LLAMA_VOCAB_PRE_TYPE_BLOOM =
LLaMA vocabulary pre-tokenization type.
23
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH =
LLaMA vocabulary pre-tokenization type.
24
- LLAMA_VOCAB_PRE_TYPE_EXAONE =
LLaMA vocabulary pre-tokenization type.
25
- LLAMA_TOKEN_ATTR_UNDEFINED =
LLaMA token attribute type.
0
- LLAMA_TOKEN_ATTR_UNKNOWN =
LLaMA token attribute type.
1
- LLAMA_TOKEN_ATTR_UNUSED =
LLaMA token attribute type.
2
- LLAMA_TOKEN_ATTR_NORMAL =
LLaMA token attribute type.
4
- LLAMA_TOKEN_ATTR_CONTROL =
LLaMA token attribute type.
8
- LLAMA_TOKEN_ATTR_USER_DEFINED =
LLaMA token attribute type.
16
- LLAMA_TOKEN_ATTR_BYTE =
LLaMA token attribute type.
32
- LLAMA_TOKEN_ATTR_NORMALIZED =
LLaMA token attribute type.
64
- LLAMA_TOKEN_ATTR_LSTRIP =
LLaMA token attribute type.
128
- LLAMA_TOKEN_ATTR_RSTRIP =
LLaMA token attribute type.
256
- LLAMA_TOKEN_ATTR_SINGLE_WORD =
LLaMA token attribute type.
512
- LLAMA_FTYPE_ALL_F32 =
LLaMA model file type.
0
- LLAMA_FTYPE_MOSTLY_F16 =
LLaMA model file type.
1
- LLAMA_FTYPE_MOSTLY_Q4_0 =
LLaMA model file type.
2
- LLAMA_FTYPE_MOSTLY_Q4_1 =
LLaMA model file type.
3
- LLAMA_FTYPE_MOSTLY_Q8_0 =
LLaMA model file type.
7
- LLAMA_FTYPE_MOSTLY_Q5_0 =
LLaMA model file type.
8
- LLAMA_FTYPE_MOSTLY_Q5_1 =
LLaMA model file type.
9
- LLAMA_FTYPE_MOSTLY_Q2_K =
LLaMA model file type.
10
- LLAMA_FTYPE_MOSTLY_Q3_K_S =
LLaMA model file type.
11
- LLAMA_FTYPE_MOSTLY_Q3_K_M =
LLaMA model file type.
12
- LLAMA_FTYPE_MOSTLY_Q3_K_L =
LLaMA model file type.
13
- LLAMA_FTYPE_MOSTLY_Q4_K_S =
LLaMA model file type.
14
- LLAMA_FTYPE_MOSTLY_Q4_K_M =
LLaMA model file type.
15
- LLAMA_FTYPE_MOSTLY_Q5_K_S =
LLaMA model file type.
16
- LLAMA_FTYPE_MOSTLY_Q5_K_M =
LLaMA model file type.
17
- LLAMA_FTYPE_MOSTLY_Q6_K =
LLaMA model file type.
18
- LLAMA_FTYPE_MOSTLY_IQ2_XXS =
LLaMA model file type.
19
- LLAMA_FTYPE_MOSTLY_IQ2_XS =
LLaMA model file type.
20
- LLAMA_FTYPE_MOSTLY_Q2_K_S =
LLaMA model file type.
21
- LLAMA_FTYPE_MOSTLY_IQ3_XS =
LLaMA model file type.
22
- LLAMA_FTYPE_MOSTLY_IQ3_XXS =
LLaMA model file type.
23
- LLAMA_FTYPE_MOSTLY_IQ1_S =
LLaMA model file type.
24
- LLAMA_FTYPE_MOSTLY_IQ4_NL =
LLaMA model file type.
25
- LLAMA_FTYPE_MOSTLY_IQ3_S =
LLaMA model file type.
26
- LLAMA_FTYPE_MOSTLY_IQ3_M =
LLaMA model file type.
27
- LLAMA_FTYPE_MOSTLY_IQ2_S =
LLaMA model file type.
28
- LLAMA_FTYPE_MOSTLY_IQ2_M =
LLaMA model file type.
29
- LLAMA_FTYPE_MOSTLY_IQ4_XS =
LLaMA model file type.
30
- LLAMA_FTYPE_MOSTLY_IQ1_M =
LLaMA model file type.
31
- LLAMA_FTYPE_MOSTLY_BF16 =
LLaMA model file type.
32
- LLAMA_FTYPE_MOSTLY_Q4_0_4_4 =
LLaMA model file type.
33
- LLAMA_FTYPE_MOSTLY_Q4_0_4_8 =
LLaMA model file type.
34
- LLAMA_FTYPE_MOSTLY_Q4_0_8_8 =
LLaMA model file type.
35
- LLAMA_FTYPE_MOSTLY_TQ1_0 =
LLaMA model file type.
36
- LLAMA_FTYPE_MOSTLY_TQ2_0 =
LLaMA model file type.
37
- LLAMA_FTYPE_GUESSED =
LLaMA model file type (not specified in the model file).
1024
- LLAMA_KV_OVERRIDE_TYPE_INT =
LLaMA KV override type.
0
- LLAMA_KV_OVERRIDE_TYPE_FLOAT =
LLaMA KV override type.
1
- LLAMA_KV_OVERRIDE_TYPE_BOOL =
LLaMA KV override type.
2
- LLAMA_KV_OVERRIDE_TYPE_STR =
LLaMA KV override type.
3
- LLAMA_GRETYPE_END =
GrammarElement type: end of rule definition.
0
- LLAMA_GRETYPE_ALT =
GrammarElement type: start of alternate definition for rule.
1
- LLAMA_GRETYPE_RULE_REF =
GrammarElement type: non-terminal element: reference to rule.
2
- LLAMA_GRETYPE_CHAR =
GrammarElement type: terminal element: character (code point).
3
- LLAMA_GRETYPE_CHAR_NOT =
GrammarElement type: inverse char(s) ([^a], [^a-b], [^abc]).
4
- LLAMA_GRETYPE_CHAR_RNG_UPPER =
GrammarElement type: modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to be an inclusive range ([a-z]).
5
- LLAMA_GRETYPE_CHAR_ALT =
GrammarElement type: modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]).
6
- LLAMA_GRETYPE_CHAR_ANY =
GrammarElement type: any character (.)
7
- LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED =
RoPE scaling type.
-1
- LLAMA_ROPE_SCALING_TYPE_NONE =
RoPE scaling type.
0
- LLAMA_ROPE_SCALING_TYPE_LINEAR =
RoPE scaling type.
1
- LLAMA_ROPE_SCALING_TYPE_YARN =
RoPE scaling type.
2
- LLAMA_ROPE_SCALING_TYPE_MAX_VALUE =
RoPE scaling type.
LLAMA_ROPE_SCALING_TYPE_YARN
- LLAMA_POOLING_TYPE_UNSPECIFIED =
LLaMA pooling type.
-1
- LLAMA_POOLING_TYPE_NONE =
LLaMA pooling type.
0
- LLAMA_POOLING_TYPE_MEAN =
LLaMA pooling type.
1
- LLAMA_POOLING_TYPE_CLS =
LLaMA pooling type.
2
- LLAMA_POOLING_TYPE_LAST =
LLaMA pooling type.
3
- LLAMA_ATTENTION_TYPE_UNSPECIFIED =
LLaMA attention type.
-1
- LLAMA_ATTENTION_TYPE_CAUSAL =
LLaMA attention type.
0
- LLAMA_ATTENTION_TYPE_NON_CAUSAL =
LLaMA attention type.
1
- LLAMA_SPLIT_MODE_NONE =
LLaMA split mode: single GPU.
0
- LLAMA_SPLIT_MODE_LAYER =
LLaMA split mode: split layers and KV across GPUs.
1
- LLAMA_SPLIT_MODE_ROW =
LLaMA split mode: split rows across GPUs.
2
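These constants are plain Integer values, so they can be compared directly against values returned by the bindings. A small sketch, assuming Model exposes a vocab_type reader mirroring llama.cpp's llama_vocab_type and a model loaded as in the Overview:

if model.vocab_type == LLaMACpp::LLAMA_VOCAB_TYPE_BPE
  puts 'model uses a BPE vocabulary'
end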
Class Method Summary
-
.backend_free ⇒ Object
Finalizes the backend, currently only used for MPI.
-
.backend_init ⇒ Object
Initializes the backend.
-
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates text that continues the given prompt (intended as a simple operation check).
-
.max_devices ⇒ Integer
Returns the maximum number of devices.
-
.model_quantize(input_path:, output_path:, params:) ⇒ Object
Quantizes the model.
-
.numa_init(strategy) ⇒ Object
Initializes NUMA.
-
.print_system_info ⇒ Object
Prints system information.
-
.supports_gpu_offload? ⇒ Boolean
Returns whether GPU offload is supported.
-
.supports_mlock? ⇒ Boolean
Returns whether mlock is supported.
-
.supports_mmap? ⇒ Boolean
Returns whether mmap is supported.
-
.time_us ⇒ Integer
Returns the current time in microseconds.
Class Method Details
.backend_free ⇒ Object
Finalizes the backend, currently only used for MPI.
# File 'ext/llama_cpp/dummy.rb', line 358
def backend_free; end
.backend_init ⇒ Object
Initializes the backend.
# File 'ext/llama_cpp/dummy.rb', line 355
def backend_init; end
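A typical lifecycle calls backend_init once before any model is loaded and backend_free once at shutdown; the sketch below assumes nothing beyond the two calls documented here:

require 'llama_cpp'

LLaMACpp.backend_init
begin
  # ... load models, create contexts, generate text ...
ensure
  LLaMACpp.backend_free
end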
.generate(context, prompt, n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64, repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8) ⇒ String
Generates text that continues the given prompt (intended as a simple operation check).
# File 'lib/llama_cpp.rb', line 27
def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
             n_predict: 128, n_keep: 10, n_batch: 512, repeat_last_n: 64,
             repeat_penalty: 1.1, frequency: 0.0, presence: 0.0,
             top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

  spaced_prompt = " #{prompt}"
  embd_input = context.model.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
  raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4

  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_past = 0
  n_remain = n_predict
  n_vocab = context.model.n_vocab
  output = []

  while n_remain != 0
    unless embd.empty?
      if n_past + embd.size > n_ctx
        n_left = n_past - n_keep
        n_past = n_keep
        embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
      end

      context.decode(LLaMACpp::Batch.get_one(tokens: embd, n_tokens: embd.size, pos_zero: n_past, seq_id: 0))
    end

    n_past += embd.size
    embd.clear

    if embd_input.size <= n_consumed
      logits = context.logits
      base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
      candidates = LLaMACpp::TokenDataArray.new(base_candidates)

      # apply penalties
      last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
      context.sample_repetition_penalties(
        candidates, last_n_tokens[-last_n_repeat..],
        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
      )

      # temperature sampling
      context.sample_top_k(candidates, k: top_k)
      context.sample_tail_free(candidates, z: tfs_z)
      context.sample_typical(candidates, prob: typical_p)
      context.sample_top_p(candidates, prob: top_p)
      context.sample_temp(candidates, temp: temperature)
      id = context.sample_token(candidates)

      last_n_tokens.shift
      last_n_tokens.push(id)

      embd.push(id)

      n_remain -= 1
    else
      while embd_input.size > n_consumed
        embd.push(embd_input[n_consumed])
        last_n_tokens.shift
        last_n_tokens.push(embd_input[n_consumed])
        n_consumed += 1
        break if embd.size >= n_batch
      end
    end

    embd.each { |token| output << context.model.token_to_piece(token) }

    break if !embd.empty? && embd[-1] == context.model.token_eos
  end

  output.join.scrub('?').strip.delete_prefix(prompt).strip
end
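For example, assuming context was built as in the Overview, the helper can be called with a few sampling options while the remaining keywords keep their defaults:

text = LLaMACpp.generate(context, 'Explain what a tokenizer does.',
                         n_predict: 64, temperature: 0.7, top_k: 40, top_p: 0.9)
puts text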
.max_devices ⇒ Integer
Returns the maximum number of devices.
# File 'ext/llama_cpp/dummy.rb', line 381
def max_devices; end
.model_quantize(input_path:, output_path:, params:) ⇒ Object
Quantizes the model.
# File 'ext/llama_cpp/dummy.rb', line 370
def model_quantize(input_path:, output_path:, params:); end
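A minimal quantization sketch, assuming ModelQuantizeParams exposes an ftype writer that accepts one of the LLAMA_FTYPE_* constants listed above (the file names are placeholders):

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_K_M

LLaMACpp.model_quantize(input_path: 'model-f16.gguf',
                        output_path: 'model-q4_k_m.gguf',
                        params: params)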
.numa_init(strategy) ⇒ Object
Initializes NUMA.
# File 'ext/llama_cpp/dummy.rb', line 363
def numa_init(strategy); end
.print_system_info ⇒ Object
Prints system information.
# File 'ext/llama_cpp/dummy.rb', line 373
def print_system_info; end
.supports_gpu_offload? ⇒ Boolean
Returns whether GPU offload is supported.
# File 'ext/llama_cpp/dummy.rb', line 393
def supports_gpu_offload?; end
.supports_mlock? ⇒ Boolean
Returns whether mlock is supported.
# File 'ext/llama_cpp/dummy.rb', line 389
def supports_mlock?; end
.supports_mmap? ⇒ Boolean
Returns whether mmap is supported.
# File 'ext/llama_cpp/dummy.rb', line 385
def supports_mmap?; end
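The capability queries return plain booleans, so they can be used to report or gate configuration before loading a model; for example:

puts "max devices: #{LLaMACpp.max_devices}"
puts "GPU offload: #{LLaMACpp.supports_gpu_offload?}"
puts "mmap:        #{LLaMACpp.supports_mmap?}"
puts "mlock:       #{LLaMACpp.supports_mlock?}"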
.time_us ⇒ Integer
Returns the current time in microseconds.
# File 'ext/llama_cpp/dummy.rb', line 377
def time_us; end
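Since time_us returns an integer timestamp in microseconds, it can be used for coarse timing around a call; for example, with a context built as in the Overview:

t0 = LLaMACpp.time_us
LLaMACpp.generate(context, 'Hello')
t1 = LLaMACpp.time_us
printf("generation took %.1f ms\n", (t1 - t0) / 1000.0)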