From 4c33d1fc36ef522ffb4d08ae2e940de2f44e442c Mon Sep 17 00:00:00 2001 From: stephantul Date: Fri, 3 Apr 2026 12:51:03 +0200 Subject: [PATCH 1/2] fix: multiword broken --- model2vec/distill/distillation.py | 6 ------ model2vec/tokenizer/tokenizer.py | 19 +++++++++++++++++-- tests/test_distillation.py | 7 +++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py index f6aadd1..a6df4cb 100644 --- a/model2vec/distill/distillation.py +++ b/model2vec/distill/distillation.py @@ -88,12 +88,6 @@ def distill_from_model( # Create the vocabulary in the new tokenizer. tokenizer_model = clean_and_create_vocabulary(tokenizer_model, vocabulary, token_remove_regex=token_remove_regex) - # Remove the post processor, this is not necessary. - tokenizer_model.post_processor = None - # Prune again now that the post processor is gone. - # We can't do this before because we need the post processor and associated - # tokens before to add eos/bos. - tokenizer_model = tokenizer_model.prune_added_tokens() # All tokens in a single list. all_tokens = tokenizer_model.sorted_vocabulary diff --git a/model2vec/tokenizer/tokenizer.py b/model2vec/tokenizer/tokenizer.py index cfb3f83..0d4f394 100644 --- a/model2vec/tokenizer/tokenizer.py +++ b/model2vec/tokenizer/tokenizer.py @@ -13,7 +13,18 @@ def clean_and_create_vocabulary( vocabulary_to_add: list[str], token_remove_regex: re.Pattern[str] | None, ) -> TokenizerModel: - """Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary.""" + """ + Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary. + + This function removes duplicate tokens and tokens that are already in the model's vocabulary. + It also removes the tokenizer's post-processor, which we do not use anyway. + + :param model: The tokenizer model to clean. + :param vocabulary_to_add: The vocabulary to add to the model. Any tokens in this vocabulary that + are split according to the pretokenizer are added as multiword tokens. + :param token_remove_regex: A regex pattern to remove tokens from the vocabulary. + :return: The cleaned tokenizer model. + """ seen_tokens = set() n_duplicate = 0 @@ -39,7 +50,9 @@ def clean_and_create_vocabulary( if len(preprocessed) > 1: tokens_as_str = [f"'{subword}'" for subword in preprocessed] split_into = ",".join(tokens_as_str) - logger.warning(f"Token '{token}' was split into multiple tokens after preprocessing: [{split_into}]") + logger.warning( + f"Token '{token}' was split into multiple tokens after preprocessing: [{split_into}], adding it as a multi-word token." + ) added_tokens_to_add.append(token) continue token = preprocessed[0] @@ -54,6 +67,8 @@ def clean_and_create_vocabulary( seen_tokens.add(token) tokens_to_add.append(token) + model.post_processor = None + model = model.prune_added_tokens() model = model.add_tokens_to_vocabulary(tokens_to_add, preprocess_tokens=True) model = model.add_addedtokens(added_tokens_to_add, is_special=False, single_word=False, normalized=True) diff --git a/tests/test_distillation.py b/tests/test_distillation.py index e4feb4b..7aea79d 100644 --- a/tests/test_distillation.py +++ b/tests/test_distillation.py @@ -37,6 +37,7 @@ (None, 1024, None), # Subword, PCA set high, SIF off (None, None, 1e-4), # No PCA, SIF on (None, 0.9, 1e-4), # PCA as float (variance), SIF on + (["star wars"], 8, None), # Multiword vocabulary ], ) @patch.object(import_module("model2vec.distill.distillation"), "model_info") @@ -79,6 +80,12 @@ def test_distill_from_model( assert json.loads(static_model.tokenizer.to_str()) == json.loads(static_model2.tokenizer.to_str()) assert static_model.base_model_name == static_model2.base_model_name + for token in vocabulary or []: + # Normalized tokens are for single-word tokens. + # Other tokens are added as addedtokens, as is. + normalized = static_model.tokenizer.normalizer.normalize_str(token) + assert token in static_model.tokens or normalized in static_model.tokens + @patch.object(import_module("model2vec.distill.distillation"), "model_info") @patch("transformers.AutoModel.from_pretrained") From 0735780e2bab6f9149c89e3bffaac76919c5fdf4 Mon Sep 17 00:00:00 2001 From: stephantul Date: Fri, 3 Apr 2026 13:49:00 +0200 Subject: [PATCH 2/2] increase version --- model2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model2vec/version.py b/model2vec/version.py index d93a8e5..1f7c8cd 100644 --- a/model2vec/version.py +++ b/model2vec/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 8, 1) +__version_triple__ = (0, 8, 2) __version__ = ".".join(map(str, __version_triple__))