From fb8e2fe606f1e1013c7e6b315db433e9f389c16f Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Mon, 23 Oct 2023 06:27:28 -0600
Subject: [PATCH] Add support for loading merges.txt

Add --padvocab option to convert.py

Other minor cleanups
---
 convert.py             | 36 +++++++++++++++++++++++++++++-------
 gguf-py/gguf/gguf.py   | 32 ++++++++++++++++++++++++++++++++
 gguf-py/pyproject.toml |  2 +-
 3 files changed, 62 insertions(+), 8 deletions(-)
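Usage note (not part of the applied diff): with --padvocab, a model whose GGUF
metadata expects more tokens than the tokenizer files provide can still be
converted; the gap is filled with <dummyNNNNN> placeholder tokens. A
hypothetical invocation, with made-up paths:

    python convert.py models/my-model/ --vocabtype bpe --padvocab --outfile my-model.gguf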
diff --git a/convert.py b/convert.py
index 9110f15806c6b..433dbe95eb1fd 100755
--- a/convert.py
+++ b/convert.py
@@ -779,7 +779,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                     break
             yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
@@ -787,12 +787,21 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size_base
             return
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         if vocab.fname_added_tokens is not None:
             msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
@@ -877,8 +886,12 @@ def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -905,8 +918,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
             return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out   : Path, ftype: GGMLFileType, params: Params,
+        model       : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency : int = DEFAULT_CONCURRENCY,
+        endianess   : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab   : bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1126,6 +1145,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx",         type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian",   action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab",    action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
 
     args = parser.parse_args(args_in)
     if args.dump_single:
@@ -1173,7 +1193,8 @@ def main(args_in: list[str] | None = None) -> None:
             load_merges = args.vocabtype == 'bpe',
             n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+            endianess = endianess, pad_vocab = args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1196,7 +1217,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+        concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
     print(f"Wrote {outfile}")
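For context on the loader added to gguf.py below: merges.txt follows the usual
HuggingFace BPE layout, an optional '#version: ...' comment on the first line,
then one space-separated symbol pair per line. A made-up fragment of the kind
of input _try_load_merges_txt accepts:

    #version: 0.2
    Ġ t
    h e
    i n

Each accepted pair is re-joined with a single space before being stored in
self.merges; malformed lines only produce a warning, so one bad entry does not
abort the whole conversion.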
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index a2271d225d001..36d8a2ef84e7a 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -1023,6 +1023,35 @@ def __init__(
     def _load(self, path: Path) -> None:
         if not self._try_load_from_tokenizer_json(path):
             self._try_load_from_config_json(path)
+        if self.load_merges and len(self.merges) == 0:
+            self._try_load_merges_txt(path)
+
+    def _try_load_merges_txt(self, path: Path) -> bool:
+        merges_file = path / 'merges.txt'
+        if not merges_file.is_file():
+            return False
+        with open(merges_file, 'r') as fp:
+            first_line = next(fp, '').strip()
+            if not first_line.startswith('#'):
+                fp.seek(0)
+                line_num = 0
+            else:
+                line_num = 1
+            merges = []
+            for line in fp:
+                line_num += 1
+                line = line.strip()
+                if len(line) == 0:
+                    continue
+                parts = line.split(None, 3)
+                if len(parts) != 2:
+                    print(f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
+                        file = sys.stderr)
+                    continue
+                merges.append(f'{parts[0]} {parts[1]}')
+        self.merges = merges
+        return True
+
     def _set_special_token(self, typ: str, tid: Any):
         if not isinstance(tid, int) or tid < 0:
@@ -1083,6 +1112,9 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
             if not quiet:
                 print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
+        elif self.load_merges:
+            print('gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
+                file = sys.stderr)
         for typ, tokid in self.special_token_ids.items():
             handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if handler is None:
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index f0741a7c23e03..c6cb2c37a0e0a 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.5"
+version = "0.4.6"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
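The fallback can also be exercised directly, which is a quick way to check a
checkpoint that ships merges.txt but no tokenizer.json. A minimal sketch,
assuming gguf-py 0.4.6 is installed and models/my-model is a hypothetical
directory containing only merges.txt:

    from pathlib import Path
    import gguf

    # With load_merges=True, SpecialVocab tries tokenizer.json, then
    # config.json, and with this patch falls back to merges.txt when
    # neither supplies any merges.
    svocab = gguf.SpecialVocab(Path('models/my-model'), load_merges = True)
    print(f'loaded {len(svocab.merges)} merge(s)')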