Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions cds_migrator_kit/rdm/records/transform/models/faser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM FASER model."""

from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
rdm_base_publication_model,
)
from cds_migrator_kit.transform.overdo import CdsOverdo


class FaserModel(CdsOverdo):
    """Translation model for FASER records.

    Holds the record-selection query, the MARC keys that are deliberately
    left untranslated and the default fields for FASER records.
    """

    # Matches records tagged 693__:FASER whose 980__a is NOTE, CERNPHEPDRAFT,
    # CONFERENCEPAPER, SLIDE or CONFERENCENOTE, or is ARTICLE/PREPRINT without
    # the CONFERENCEPAPER tag.
    __query__ = (
        "693__:FASER AND ("
        "980__.a:NOTE OR "
        "980__.a:CERNPHEPDRAFT OR "
        "(980__.a:CONFERENCEPAPER OR 980__.a:SLIDE OR 980__.a:CONFERENCENOTE) OR "
        "((980__.a:ARTICLE OR 980__.a:PREPRINT) -980__.a:CONFERENCEPAPER)"
        ")"
    )

    # MARC keys (tag + indicators + subfield) that are ignored on purpose.
    # This is a set, so every key is listed exactly once.
    __ignore_keys__ = {
        "0248_a",
        "0248_p",
        "0248_q",
        "035__d",  # oai harvest tag
        "035__h",  # oai harvest tag
        "035__m",  # oai harvest tag
        "035__t",  # oai harvest tag
        "100__m",  # email of contributor
        "300__a",  # number of pages
        "500__9",
        "520__9",
        "540__3",
        "542__3",
        "700__m",  # email of contributor
        "773__p",  # TODO: title of the related links: 2959848, 2949836. values: twiki, internal comments
        "8564_8",
        "8564_s",
        "8564_x",
        "8564_y",  # file description - done by files dump
        "916__y",  # year of publication, redundant
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__c",  # CDS modification tag # TODO
        "961__h",  # CDS modification tag # TODO
        "961__l",  # CDS modification tag # TODO
        "961__x",  # CDS modification tag # TODO
        "981__a",  # duplicate record id
        "980__b",  # additional article tag
        # temporarily ignored to check ep approval status. these are not in draft collections
        "905__m",
        "903__s",
        "037__c",
        "905__a",
        "035__u",
        "905__p",
        "245__9",
        "270__m",
        "0247_9",
        "773__z",
        "700__v",
        "540__f",
        "700__j",
        "542__h",
        "100__v",
        "506__m",
        "110__a",
        "8564_z",
        "9031_c",
        "773__x",
        "100__j",
        "906__a",
        "906__p",
        "906__m",
        "595_Ds",
        "595_Dd",
        "595_Da",
    }

    # Default fields for this model: an empty ``custom_fields`` mapping.
    _default_fields = {
        "custom_fields": {},
    }


# Module-level FASER model instance. It is built on top of the base
# publication model (passed via ``bases``) and is tied to the
# ``cds_migrator_kit.migrator.rules.faser`` entry point group.
faser_model = FaserModel(
bases=(rdm_base_publication_model,),
entry_point_group="cds_migrator_kit.migrator.rules.faser",
)
148 changes: 148 additions & 0 deletions cds_migrator_kit/rdm/records/transform/xml_processing/rules/faser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM FASER rules."""

from dojson.errors import IgnoreKey
from dojson.utils import for_each_value

from cds_migrator_kit.errors import UnexpectedValue
from cds_migrator_kit.rdm.records.transform.xml_processing.rules.base import urls
from cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications import journal


from ...models.faser import faser_model as model


@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Translate the 690C_ collection field into a ``collection:`` subject.

    :param key: MARC key that matched this rule.
    :param value: subfields of the field; ``a`` holds the collection name.
    :raises UnexpectedValue: if subfield ``a`` is missing, empty or not one
        of the collections known for FASER.
    :raises IgnoreKey: always, once the subject has been appended to
        ``self["subjects"]``; the rule itself produces no value.
    """
    # Default to "" so that a missing subfield is reported as UnexpectedValue
    # below instead of failing with AttributeError on ``None.strip()``.
    name = value.get("a", "").strip()
    if name.lower() not in (
        "cern",
        "faser",
        "preprint",
        "scicom",
        "article",
        "publlhcc",
    ):
        raise UnexpectedValue(subfield="a", field=key, value=value)

    # The collection keeps its original casing inside the subject string.
    subjects = self.get("subjects", [])
    subjects.append({"subject": f"collection:{name}"})
    self["subjects"] = subjects
    raise IgnoreKey("collection")


@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Map the 980__a tag to a resource type ``{"id": ...}`` mapping.

    The tag is compared after stripping whitespace and lower-casing.

    :raises UnexpectedValue: if the tag is missing or has no mapping.
    """
    type_ids = {
        "article": "publication-article",
        "preprint": "publication-preprint",
        "cernphepdraft": "publication-preprint",
        "scicommpubllhcc": "publication-article",
        "bookchapter": "publication-section",
        "faser_papers": "publication-article",
        "note": "publication-technicalnote",
        "conferencepaper": "publication-conferenceproceeding",
        "slide": "presentation",
        # TODO: is this correct?
        "conferencenote": "publication-technicalnote",
    }

    raw = value.get("a")
    # Only non-empty tags are normalised; a missing or empty one is passed
    # to the lookup unchanged and ends up reported as unknown.
    tag = raw.strip().lower() if raw else raw

    try:
        return {"id": type_ids[tag]}
    except KeyError:
        raise UnexpectedValue("Unknown resource type (FASER)", field=key, value=tag)


@model.over("status", "^591__")
@for_each_value
def status(self, key, value):
    """Validate the 591__b status and drop the field.

    :param key: MARC key that matched this rule.
    :param value: subfields of the field; ``b`` holds the status.
    :raises UnexpectedValue: if subfield ``b`` is missing, empty or neither
        ``draft`` nor ``approved`` (case-insensitive).
    :raises IgnoreKey: always when the status is valid; nothing is stored.
    """
    # Default to "" so that a missing subfield is reported as UnexpectedValue
    # below instead of failing with AttributeError on ``None.strip()``.
    state = value.get("b", "").strip().lower()
    # TODO: there is no other value, can we ignore?
    if state not in ("draft", "approved"):
        raise UnexpectedValue(subfield="b", field=key, value=value)
    raise IgnoreKey("status")


@model.over("restriction_access_notes", "^5061_")
@for_each_value
def restriction_access_notes(self, key, value):
    """Validate the 5061_a access restriction note and drop the field.

    An absent or empty note is accepted. Any other note must be one of the
    known FASER values (compared stripped and lower-cased).

    :raises UnexpectedValue: if the note is non-empty and not a known value.
    :raises IgnoreKey: otherwise; nothing is stored for this field.
    """
    known_notes = (
        "faser",
        "faser-preprint",
        "faser-slide",
        "faser-confnote",
        "faser-confpaper",
    )
    note = value.get("a", "").strip().lower()
    # TODO: how to handle these values?
    if not note or note in known_notes:
        raise IgnoreKey("restriction_access_notes")
    raise UnexpectedValue(subfield="a", field=key, value=value)


@model.over("related_ids", "(^773__)", override=True)
@for_each_value
def related_ids(self, key, value):
    """Translate a 773__ field into journal data or a related identifier.

    Without a link in subfield ``u`` the field is handed to the ``journal``
    rule and its result is stored in ``self["custom_fields"]``. With a link
    it is handed to the base ``urls`` rule and the first result is added to
    ``self["related_identifiers"]`` unless it is already there.

    :raises UnexpectedValue: if subfield ``m`` is set to anything other
        than ``publication`` (case-insensitive).
    :raises IgnoreKey: in every other case; results are written to ``self``.
    """
    marker = value.get("m", "")
    if marker and marker.strip().lower() != "publication":
        raise UnexpectedValue(subfield="m", field=key, value=value)

    if not value.get("u", ""):
        # TODO: this case does not exist in draft records
        self["custom_fields"] = journal(self, key, value)
        raise IgnoreKey("related_ids")

    # TODO: how to transform? https://cds.cern.ch/record/1452204/export/xm
    # Transformed like the base `urls` rule; the title in "p" is lost - is that okay?
    converted = urls(self, key, value)
    if converted:
        first = converted[0]
        identifiers = self.get("related_identifiers", [])
        if first not in identifiers:
            identifiers.append(first)
        self["related_identifiers"] = identifiers

    raise IgnoreKey("related_ids")


# TODO: maybe it's better to separate models with draft/public records?
@model.over("ep_approval", "^9031_")
@for_each_value
def ep_approval(self, key, value):
    """Translate the 9031_ EP approval field into a dict.

    Every subfield is stripped; ``s`` (status) and ``f`` (submitted by) are
    also lower-cased. Empty entries are left out of the result.

    :param key: MARC key that matched this rule.
    :param value: subfields of the field.
    :returns: dict with any of ``status``, ``submitted_by``, ``date``,
        ``deadline``, ``description``, ``ep_report_number``, ``stamp_info``
        and ``doc_type`` that carry a non-empty value.
    :raises UnexpectedValue: if subfield ``s`` is missing or is neither
        ``waiting`` nor ``approved``.
    """
    status = value.get("s", "").strip().lower()
    if status not in ("waiting", "approved"):
        # The status comes from subfield "s", so that is the subfield to
        # report ("a" is the description).
        raise UnexpectedValue(subfield="s", field=key, value=value)

    approval = {
        "status": status,
        "submitted_by": value.get("f", "").strip().lower(),
        "date": value.get("d", "").strip(),
        "deadline": value.get("e", "").strip(),
        "description": value.get("a", "").strip(),
        "ep_report_number": value.get("b", "").strip(),
        "stamp_info": value.get("g", "").strip(),
        "doc_type": value.get("c", "").strip(),
    }
    return {name: text for name, text in approval.items() if text}
23 changes: 23 additions & 0 deletions cds_migrator_kit/rdm/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,26 @@ records:
missing_users: cds_migrator_kit/rdm/data/users
communities_ids:
- ""
faser_drafts:
data_dir: cds_migrator_kit/rdm/data/faser-drafts
tmp_dir: cds_migrator_kit/rdm/tmp/faser-drafts
log_dir: cds_migrator_kit/rdm/log/faser-drafts
restricted: "True"
extract:
dirpath: cds_migrator_kit/rdm/data/faser-drafts/dump/
transform:
files_dump_dir: cds_migrator_kit/rdm/data/faser-drafts/files/
missing_users: cds_migrator_kit/rdm/data/users
communities_ids:
- "3f8173e9-bb14-4d24-b199-83442b5d0ccc"
faser:
data_dir: cds_migrator_kit/rdm/data/faser
tmp_dir: cds_migrator_kit/rdm/tmp/faser
log_dir: cds_migrator_kit/rdm/log/faser
extract:
dirpath: cds_migrator_kit/rdm/data/faser/dump/
transform:
files_dump_dir: cds_migrator_kit/rdm/data/faser/files/
missing_users: cds_migrator_kit/rdm/data/users
communities_ids:
- "3f8173e9-bb14-4d24-b199-83442b5d0ccc"
6 changes: 6 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ cds_migrator_kit.migrator.models =
en = cds_migrator_kit.rdm.records.transform.models.en:en_model
annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model
fap = cds_migrator_kit.rdm.records.transform.models.fap:fap_model
faser = cds_migrator_kit.rdm.records.transform.models.faser:faser_model
cds_migrator_kit.migrator.rules.base =
base = cds_migrator_kit.transform.xml_processing.rules.base
cds_migrator_kit.migrator.rdm.rules.base =
Expand Down Expand Up @@ -164,6 +165,11 @@ cds_migrator_kit.migrator.rules.fap =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
fap = cds_migrator_kit.rdm.records.transform.xml_processing.rules.fap
cds_migrator_kit.migrator.rules.faser =
base = cds_migrator_kit.transform.xml_processing.rules.base
base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
publication = cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications
faser = cds_migrator_kit.rdm.records.transform.xml_processing.rules.faser
cds_migrator_kit.migrator.rules.people =
people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people
invenio_pidstore.minters =
Expand Down
Loading