From ad9f970c9f8e079bf444f3d3c7251f5bdf4872b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Mon, 18 May 2026 15:32:06 +0200 Subject: [PATCH] add(faser-draft): new model and transformation rules --- .../rdm/records/transform/models/faser.py | 97 ++++++++++++ .../transform/xml_processing/rules/faser.py | 148 ++++++++++++++++++ cds_migrator_kit/rdm/streams.yaml | 23 +++ setup.cfg | 6 + 4 files changed, 274 insertions(+) create mode 100644 cds_migrator_kit/rdm/records/transform/models/faser.py create mode 100644 cds_migrator_kit/rdm/records/transform/xml_processing/rules/faser.py diff --git a/cds_migrator_kit/rdm/records/transform/models/faser.py b/cds_migrator_kit/rdm/records/transform/models/faser.py new file mode 100644 index 00000000..69994096 --- /dev/null +++ b/cds_migrator_kit/rdm/records/transform/models/faser.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. +# +# CDS-RDM is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. 
+
+"""CDS-RDM FASER model."""
+
+from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
+    rdm_base_publication_model,
+)
+from cds_migrator_kit.transform.overdo import CdsOverdo
+
+
+class FaserModel(CdsOverdo):
+    """Translation model for FASER records."""
+
+    # Matches 693__:FASER records whose 980__.a is one of the listed collections.
+    __query__ = (
+        "693__:FASER AND ("
+        "980__.a:NOTE OR "
+        "980__.a:CERNPHEPDRAFT OR "
+        "(980__.a:CONFERENCEPAPER OR 980__.a:SLIDE OR 980__.a:CONFERENCENOTE) OR "
+        "((980__.a:ARTICLE OR 980__.a:PREPRINT) -980__.a:CONFERENCEPAPER)"
+        ")"
+    )
+
+    # NOTE(review): presumably MARC keys the transform skips; confirm in CdsOverdo.
+    __ignore_keys__ = {
+        "0248_a",
+        "0248_p",
+        "0248_q",
+        "035__d",  # oai harvest tag
+        "035__h",  # oai harvest tag
+        "035__m",  # oai harvest tag
+        "035__t",  # oai harvest tag
+        "100__m",  # email of contributor
+        "300__a",  # number of pages
+        "500__9",
+        "520__9",
+        "540__3",
+        "542__3",
+        "700__m",  # email of contributor
+        "773__p",  # TODO: title of the related links: 2959848, 2949836. values: twiki, internal comments
+        "8564_8",
+        "8564_s",
+        "8564_x",
+        "8564_y",  # file description - done by files dump
+        "916__y",  # year of publication, redundant
+        "937__c",  # last modified by
+        "937__s",  # last modification date
+        "960__a",  # base number
+        "961__c",  # CDS modification tag # TODO
+        "961__h",  # CDS modification tag # TODO
+        "961__l",  # CDS modification tag # TODO
+        "961__x",  # CDS modification tag # TODO
+        "981__a",  # duplicate record id
+        "980__b",  # additional article tag
+        # Temporarily ignored while checking EP approval status; these are not in draft collections.
+        "905__m",
+        "903__s",
+        "037__c",
+        "905__a",
+        "035__u",
+        "905__p",
+        "245__9",
+        "270__m",
+        "0247_9",
+        "773__z",
+        "700__v",
+        "540__f",
+        "700__j",
+        "542__h",
+        "100__v",
+        "506__m",
+        "110__a",
+        "8564_z",
+        "9031_c",
+        "773__x",
+        "100__j",
+        "906__a",
+        "906__p",
+        "906__m",
+        "595_Ds",
+        "595_Dd",
+        "595_Da",
+    }
+
+    # NOTE(review): presumably defaults applied to every record; confirm in CdsOverdo.
+    _default_fields = {"custom_fields": {}}
+
+
+faser_model = FaserModel(
+    bases=(rdm_base_publication_model,),
+    entry_point_group="cds_migrator_kit.migrator.rules.faser",
+)
diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/faser.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/faser.py
new file mode 100644
index 00000000..fd06f9b7
--- /dev/null
+++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/faser.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2026 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-RDM FASER rules."""
+
+from dojson.errors import IgnoreKey
+from dojson.utils import for_each_value
+
+from cds_migrator_kit.errors import UnexpectedValue
+from cds_migrator_kit.rdm.records.transform.xml_processing.rules.base import urls
+from cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications import journal
+
+from ...models.faser import faser_model as model
+
+
+@model.over("collection", "^690C_", override=True)
+@for_each_value
+def collection(self, key, value):
+    """Record a 690C_ collection tag as a ``collection:<name>`` subject.
+
+    Raises UnexpectedValue for a missing/unknown tag, IgnoreKey otherwise.
+    """
+    collection = value.get("a", "").strip()  # "" when the subfield is absent
+    if collection.lower() not in ["cern", "faser", "preprint", "scicom", "article", "publlhcc"]:
+        raise UnexpectedValue(subfield="a", field=key, value=value)
+    subjects = self.get("subjects", [])
+    subjects.append({"subject": f"collection:{collection}"})
+    self["subjects"] = subjects
+    raise IgnoreKey("collection")
+
+
+@model.over("resource_type", "^980__", override=True)
+def resource_type(self, key, value):
+    """Return the RDM resource type mapped from the 980__ ``a`` subfield.
+
+    Raises UnexpectedValue when the subfield is missing or has no mapping.
+    """
+    value = value.get("a")
+    if value:
+        value = value.strip().lower()
+    map_ = {
+        "article": {"id": "publication-article"},
+        "preprint": {"id": "publication-preprint"},
+        "cernphepdraft": {"id": "publication-preprint"},
+        "scicommpubllhcc": {"id": "publication-article"},
+        "bookchapter": {"id": "publication-section"},
+        "faser_papers": {"id": "publication-article"},
+        "note": {"id": "publication-technicalnote"},
+        "conferencepaper": {"id": "publication-conferenceproceeding"},
+        "slide": {"id": "presentation"},
+        # TODO: is this correct?
+        "conferencenote": {"id": "publication-technicalnote"},
+    }
+    if value not in map_:
+        raise UnexpectedValue("Unknown resource type (FASER)", field=key, value=value)
+    return map_[value]
+
+
+@model.over("status", "^591__")
+@for_each_value
+def status(self, key, value):
+    """Validate the 591__ ``b`` status; nothing is migrated from this field.
+
+    Raises UnexpectedValue for a missing/unknown status, IgnoreKey otherwise.
+    """
+    status = value.get("b", "").strip().lower()  # "" when the subfield is absent
+    # TODO: there is no other value, can we ignore?
+    if status not in ["draft", "approved"]:
+        raise UnexpectedValue(subfield="b", field=key, value=value)
+    raise IgnoreKey("status")
+
+
+@model.over("restriction_access_notes", "^5061_")
+@for_each_value
+def restriction_access_notes(self, key, value):
+    """Validate the 5061_ ``a`` access note; nothing is migrated from this field.
+
+    Raises UnexpectedValue for an unknown non-empty note, IgnoreKey otherwise.
+    """
+    restriction_access_notes = value.get("a", "").strip().lower()
+    # TODO: how to handle these values?
+    known = ["faser", "faser-preprint", "faser-slide", "faser-confnote", "faser-confpaper"]
+    if restriction_access_notes and restriction_access_notes not in known:
+        raise UnexpectedValue(subfield="a", field=key, value=value)
+    raise IgnoreKey("restriction_access_notes")
+
+
+@model.over("related_ids", "(^773__)", override=True)
+@for_each_value
+def related_ids(self, key, value):
+    """Translate a 773__ entry into journal fields or a related identifier.
+
+    Without a ``u`` link the entry goes through ``journal``, else ``urls``.
+    Raises UnexpectedValue for an unexpected ``m``, IgnoreKey otherwise.
+    """
+    m_value = value.get("m", "")
+    if m_value and m_value.strip().lower() != "publication":
+        raise UnexpectedValue(subfield="m", field=key, value=value)
+    if not value.get("u", ""):
+        # TODO: this does not exist in draft records
+        self["custom_fields"] = journal(self, key, value)
+        raise IgnoreKey("related_ids")
+    # TODO: how to transform? https://cds.cern.ch/record/1452204/export/xm
+    # Transform like the base `urls` rule; we lose the title "p", is that okay?
+    rel_ids = urls(self, key, value)
+    if not rel_ids:
+        raise IgnoreKey("related_ids")
+    related_identifiers = self.get("related_identifiers", [])
+    if rel_ids[0] not in related_identifiers:
+        related_identifiers.append(rel_ids[0])
+    self["related_identifiers"] = related_identifiers
+    raise IgnoreKey("related_ids")
+
+
+# TODO: maybe it's better to separate models with draft/public records?
+@model.over("ep_approval", "^9031_")
+@for_each_value
+def ep_approval(self, key, value):
+    """Translate a 9031_ EP approval entry.
+
+    Subfields are stripped (status and submitter are also lower-cased) and mapped:
+    s -> status, f -> submitted_by, d -> date, e -> deadline, a -> description,
+    b -> ep_report_number, g -> stamp_info, c -> doc_type.
+
+    Returns a dict holding only the non-empty values.
+    Raises UnexpectedValue when the status is missing or not waiting/approved.
+    """
+    # An absent subfield becomes "" here; empty values are dropped from the result.
+    fields = {
+        "status": value.get("s", "").strip().lower(),
+        "submitted_by": value.get("f", "").strip().lower(),
+        "date": value.get("d", "").strip(),
+        "deadline": value.get("e", "").strip(),
+        "description": value.get("a", "").strip(),
+        "ep_report_number": value.get("b", "").strip(),
+        "stamp_info": value.get("g", "").strip(),
+        "doc_type": value.get("c", "").strip(),
+    }
+    if fields["status"] not in ["waiting", "approved"]:
+        # The status comes from subfield "s", so that is the one to report.
+        raise UnexpectedValue(subfield="s", field=key, value=value)
+
+    # Keep only the subfields that actually carry a value.
+    return {name: text for name, text in fields.items() if text}
diff --git a/cds_migrator_kit/rdm/streams.yaml b/cds_migrator_kit/rdm/streams.yaml index 87e52a0a..bb0ac4bb 100644 --- a/cds_migrator_kit/rdm/streams.yaml +++ b/cds_migrator_kit/rdm/streams.yaml @@ -122,3 +122,26 @@ records: missing_users: cds_migrator_kit/rdm/data/users communities_ids: - "" + faser_drafts: + data_dir: cds_migrator_kit/rdm/data/faser-drafts + tmp_dir: cds_migrator_kit/rdm/tmp/faser-drafts + log_dir: cds_migrator_kit/rdm/log/faser-drafts + restricted: "True" + extract: + dirpath: cds_migrator_kit/rdm/data/faser-drafts/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/faser-drafts/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "3f8173e9-bb14-4d24-b199-83442b5d0ccc" + faser: + data_dir: cds_migrator_kit/rdm/data/faser + tmp_dir: cds_migrator_kit/rdm/tmp/faser + log_dir: cds_migrator_kit/rdm/log/faser + extract: + dirpath: cds_migrator_kit/rdm/data/faser/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/faser/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - 
"3f8173e9-bb14-4d24-b199-83442b5d0ccc" diff --git a/setup.cfg b/setup.cfg index 493dceca..fd640168 100644 --- a/setup.cfg +++ b/setup.cfg @@ -83,6 +83,7 @@ cds_migrator_kit.migrator.models = en = cds_migrator_kit.rdm.records.transform.models.en:en_model annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model fap = cds_migrator_kit.rdm.records.transform.models.fap:fap_model + faser = cds_migrator_kit.rdm.records.transform.models.faser:faser_model cds_migrator_kit.migrator.rules.base = base = cds_migrator_kit.transform.xml_processing.rules.base cds_migrator_kit.migrator.rdm.rules.base = @@ -164,6 +165,11 @@ cds_migrator_kit.migrator.rules.fap = base = cds_migrator_kit.transform.xml_processing.rules.base base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base fap = cds_migrator_kit.rdm.records.transform.xml_processing.rules.fap +cds_migrator_kit.migrator.rules.faser = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + publication = cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications + faser = cds_migrator_kit.rdm.records.transform.xml_processing.rules.faser cds_migrator_kit.migrator.rules.people = people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people invenio_pidstore.minters =