From cef6a81a7849659ddc324427b7b902c60e857924 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 28 May 2026 07:58:16 +0000 Subject: [PATCH] Automatically add _ftindex: tag at creator start based on indexing configuration --- CHANGELOG.md | 2 + src/zimscraperlib/zim/creator.py | 17 +++++++++ tests/zim/test_dedup.py | 2 +- tests/zim/test_zim_creator.py | 64 +++++++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f91b56d..94e1a5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add support for custom number of retries and user-agent in save_large_file (#278) - Enhance save_large_file log level (#279) - Extend image optimization to support in-memory streams(BytesIO/bytes) and dst_format param (#289) +- Automatically add \_ftindex: tag at creator start based on indexing configuration (#295) + - **BEHAVIOR CHANGE**: it is no longer possible to add "Tags" metadata with the Creator.add_metadata method after the creator has started; this has not been recommended since 5.x anyway (prefer Creator.config_metadata) ### Fixed diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index a5fdd03..e1f010d 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -44,6 +44,7 @@ LanguageMetadata, MetadataBase, StandardMetadataList, + TagsMetadata, ) DUPLICATE_EXC_STR = re.compile( @@ -114,6 +115,7 @@ def __init__( super().__init__(filename=filename) self._metadata: dict[str, AnyMetadata] = {} self.__indexing_configured = False + self.__indexing_value: bool = False self.can_finish = True self.set_mainpath(main_path) @@ -142,6 +144,7 @@ def config_indexing( raise ValueError("Not a valid ISO-639-3 language code") super().config_indexing(indexing, language) self.__indexing_configured = True + self.__indexing_value = indexing return self def 
_log_metadata(self): @@ -223,6 +226,20 @@ def start(self): ) and not self.__indexing_configured: self.config_indexing(True, language) + ftindex_tag = f"_ftindex:{'yes' if self.__indexing_value else 'no'}" + tags_metadata = self._metadata.get(TagsMetadata.meta_name) + if isinstance(tags_metadata, TagsMetadata): + if not any( + re.sub(r"\s+", "", part).startswith("_ftindex:") + for tag in tags_metadata.value + for part in tag.split(";") + ): + tags_metadata.value.append(ftindex_tag) + logger.debug(f"Metadata: Tags has been altered with '{ftindex_tag}'") + else: + self._metadata[TagsMetadata.meta_name] = TagsMetadata([ftindex_tag]) + logger.debug(f"Metadata: Tags has been set with '{ftindex_tag}'") + super().__enter__() for metadata in self._metadata.values(): diff --git a/tests/zim/test_dedup.py b/tests/zim/test_dedup.py index 1f36554..21ddb04 100644 --- a/tests/zim/test_dedup.py +++ b/tests/zim/test_dedup.py @@ -76,7 +76,7 @@ def add_items(creator_or_deduplicator: Any): for zim_path in [fpath_with_dedup, fpath_without_dedup]: reader = Archive(zim_path) - assert reader.all_entry_count == 24 + assert reader.all_entry_count == 25 for html_path in [ "welcome1", diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index ad2bf7b..21b633d 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -119,7 +119,7 @@ def test_zim_creator( assert reader.get_text_metadata( "Language" ) == DEFAULT_DEV_ZIM_METADATA.Language.libzim_value.decode("UTF-8") - assert reader.get_text_metadata("Tags") == tags + assert reader.get_text_metadata("Tags") == f"{tags};_ftindex:yes" assert reader.main_entry.get_item().path == f"{main_path}" # make sure we have our image assert reader.get_item("images/yahoo.png") @@ -950,7 +950,7 @@ def test_metadata_extras_missing_prefix(tmp_path: pathlib.Path): DEFAULT_DEV_ZIM_METADATA.Title.libzim_value.decode() + "Foo", id="simple_str", ), - pytest.param("Tags", TagsMetadata(["tag1", "tag2"]), "tag1;tag2", 
id="tags"), + pytest.param("Source", SourceMetadata("asource"), "asource", id="source"), ], ) def test_add_metadata( @@ -973,3 +973,63 @@ def test_config_indexing(tmp_path: pathlib.Path): assert Creator(tmp_path / "_.zim", "").config_indexing(True, "bam") assert Creator(tmp_path / "_.zim", "").config_indexing(False, "bam") assert Creator(tmp_path / "_.zim", "").config_indexing(False) + + +@pytest.mark.parametrize( + "indexing, expected_tag", + [ + pytest.param(True, "_ftindex:yes", id="explicit_yes"), + pytest.param(False, "_ftindex:no", id="explicit_no"), + ], +) +def test_start_ftindex_tag_from_explicit_config_indexing( + tmp_path: pathlib.Path, *, indexing: bool, expected_tag: str +): + fpath = tmp_path / "test.zim" + with Creator(fpath, "").config_dev_metadata().config_indexing(indexing, "fra"): + pass + tags = Archive(fpath).get_text_metadata("Tags").split(";") + assert expected_tag in tags + assert len([t for t in tags if t.startswith("_ftindex:")]) == 1 + + +def test_start_ftindex_yes_when_auto_configured_via_language(tmp_path: pathlib.Path): + # DEFAULT_DEV_ZIM_METADATA has Language=fra but no Tags; language triggers + # auto config_indexing(True) which should result in _ftindex:yes being added. 
+ fpath = tmp_path / "test.zim" + with Creator(fpath, "").config_dev_metadata(): + pass + tags = Archive(fpath).get_text_metadata("Tags").split(";") + assert "_ftindex:yes" in tags + + +def test_start_ftindex_appended_to_existing_tags(tmp_path: pathlib.Path): + fpath = tmp_path / "test.zim" + with Creator(fpath, "").config_dev_metadata( + TagsMetadata(["mytag", "_pictures:no"]) + ): + pass + tags = Archive(fpath).get_text_metadata("Tags").split(";") + assert "_ftindex:yes" in tags + assert "mytag" in tags + assert "_pictures:no" in tags + + +def test_start_ftindex_not_duplicated_when_already_set(tmp_path: pathlib.Path): + fpath = tmp_path / "test.zim" + with Creator(fpath, "").config_dev_metadata(TagsMetadata(["_ftindex:no"])): + pass + tags = Archive(fpath).get_text_metadata("Tags").split(";") + # _ftindex:no was set explicitly; auto-configured indexing=True should not override + assert tags.count("_ftindex:no") == 1 + assert "_ftindex:yes" not in tags + + +def test_start_ftindex_not_duplicated_when_set_with_spaces(tmp_path: pathlib.Path): + # " _ftindex : no" has spaces; after clean_str it becomes "_ftindex : no" + # The check must still recognise it and not add a second _ftindex tag. + fpath = tmp_path / "test.zim" + with Creator(fpath, "").config_dev_metadata(TagsMetadata([" _ftindex : no "])): + pass + tags = Archive(fpath).get_text_metadata("Tags").split(";") + assert sum(1 for t in tags if t.replace(" ", "").startswith("_ftindex:")) == 1