diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index d3c48b6e25..4e16aa97d4 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -33,6 +33,7 @@ from packagedcode import opam from packagedcode import phpcomposer from packagedcode import pubspec +from packagedcode import publiccode from packagedcode import pypi from packagedcode import readme from packagedcode import rpm @@ -77,6 +78,8 @@ conda.CondaMetaYamlHandler, conda.CondaYamlHandler, + publiccode.PubliccodeYmlHandler, + conan.ConanFileHandler, conan.ConanDataHandler,
diff --git a/src/packagedcode/publiccode.py b/src/packagedcode/publiccode.py
new file mode 100644
index 0000000000..5a1d299da8
--- /dev/null
+++ b/src/packagedcode/publiccode.py
@@ -0,0 +1,207 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import io
+
+import saneyaml
+
+from packagedcode import models
+
+"""
+Handle publiccode.yml metadata files.
+publiccode.yml is a metadata standard for public sector open source software.
+See https://github.com/publiccodeyml/publiccode.yml
+"""
+
+# Top-level publiccode.yml keys copied as-is to the package extra_data.
+EXTRA_DATA_KEYS = (
+    'publiccodeYmlVersion',
+    'platforms',
+    'developmentStatus',
+    'softwareType',
+)
+
+
+class PubliccodeYmlHandler(models.DatafileHandler):
+    datasource_id = 'publiccode_yml'
+    path_patterns = ('*publiccode.yml', '*publiccode.yaml')
+    default_package_type = 'publiccode'
+    default_primary_language = None
+    description = 'publiccode.yml metadata file'
+    documentation_url = 'https://github.com/publiccodeyml/publiccode.yml'
+
+    @classmethod
+    def parse(cls, location, package_only=False):
+        """
+        Yield a PackageData built from the publiccode.yml file at
+        ``location``. Yield nothing when the loaded content is not a mapping
+        with a 'publiccodeYmlVersion' key.
+        """
+        with io.open(location, encoding='utf-8') as loc:
+            data = saneyaml.load(loc.read())
+
+        if not is_publiccode_yml_data(data):
+            return
+
+        package_data = dict(
+            datasource_id=cls.datasource_id,
+            type=cls.default_package_type,
+            name=data.get('name'),
+            version=data.get('softwareVersion'),
+            vcs_url=data.get('url'),
+            # Fall back to 'url' when there is no 'landingURL'.
+            homepage_url=data.get('landingURL') or data.get('url'),
+            description=get_description(data),
+            extracted_license_statement=get_extracted_license_statement(data),
+            copyright=get_copyright_statement(data),
+            keywords=get_categories(data),
+            parties=get_parties(data),
+            extra_data=get_extra_data(data) or None,
+        )
+        yield models.PackageData.from_data(package_data, package_only)
+
+
+def is_publiccode_yml_data(data):
+    """
+    Return True if ``data`` is a mapping with a 'publiccodeYmlVersion' key.
+    """
+    return isinstance(data, dict) and 'publiccodeYmlVersion' in data
+
+
+def _get_mapping(data, key):
+    """
+    Return the value of ``key`` in the ``data`` mapping when this value is
+    itself a mapping, or an empty dict otherwise.
+    """
+    value = data.get(key)
+    return value if isinstance(value, dict) else {}
+
+
+def _get_text(data, key):
+    """
+    Return the stripped string value of ``key`` in the ``data`` mapping, or
+    an empty string when this value is missing or is not a string.
+    """
+    value = data.get(key)
+    return value.strip() if isinstance(value, str) else ''
+
+
+def get_description(data):
+    """
+    Return a description string from the multilingual 'description' block
+    of ``data``, or None.
+
+    Try English entries first (language tags with an 'en' primary subtag),
+    then the other languages in their order of appearance. For each
+    language prefer the longDescription over the shortDescription. Skip
+    entries and values of an unexpected type instead of raising.
+    """
+    english = []
+    others = []
+    for language, localized in _get_mapping(data, 'description').items():
+        if not isinstance(localized, dict):
+            continue
+        primary_language = str(language).lower().split('-')[0]
+        if primary_language == 'en':
+            english.append(localized)
+        else:
+            others.append(localized)
+
+    for localized in english + others:
+        text = (
+            _get_text(localized, 'longDescription')
+            or _get_text(localized, 'shortDescription')
+        )
+        if text:
+            return text
+
+    return None
+
+
+def get_extracted_license_statement(data):
+    """
+    Return the 'license' value of the 'legal' block of ``data``, or None.
+    """
+    return _get_mapping(data, 'legal').get('license')
+
+
+def get_copyright_statement(data):
+    """
+    Return the distinct 'mainCopyrightOwner' and 'repoOwner' strings of the
+    'legal' block of ``data`` joined with newlines, or None.
+    """
+    legal = _get_mapping(data, 'legal')
+    copyright_holders = []
+
+    for key in ('mainCopyrightOwner', 'repoOwner'):
+        value = _get_text(legal, key)
+        if value and value not in copyright_holders:
+            copyright_holders.append(value)
+
+    return '\n'.join(copyright_holders) or None
+
+
+def get_categories(data):
+    """
+    Return the 'categories' of ``data`` as a list. Wrap a single string in
+    a list and return an empty list for any other non-list value.
+    """
+    categories = data.get('categories') or []
+    if isinstance(categories, str):
+        return [categories]
+    if not isinstance(categories, list):
+        return []
+    return categories
+
+
+def get_parties(data):
+    """
+    Return a list of maintainer Party built from the 'maintenance' contacts
+    of ``data``. Skip contacts that are not mappings or that have neither a
+    name nor an email.
+    """
+    contacts = _get_mapping(data, 'maintenance').get('contacts')
+    if not isinstance(contacts, list):
+        return []
+
+    parties = []
+    for contact in contacts:
+        if not isinstance(contact, dict):
+            continue
+
+        contact_name = contact.get('name')
+        contact_email = contact.get('email')
+        if not (contact_name or contact_email):
+            continue
+
+        parties.append(
+            models.Party(
+                type=models.party_person,
+                name=contact_name,
+                email=contact_email,
+                role='maintainer',
+            )
+        )
+
+    return parties
+
+
+def get_extra_data(data):
+    """
+    Return a mapping of the EXTRA_DATA_KEYS that have a non-empty value in
+    ``data``.
+    """
+    extra_data = {}
+
+    for key in EXTRA_DATA_KEYS:
+        value = data.get(key)
+        if value:
+            extra_data[key] = value
+
+    return extra_data
diff --git a/tests/packagedcode/data/publiccode/publiccode.yml b/tests/packagedcode/data/publiccode/publiccode.yml new file mode 100644 index 0000000000..d7b7a28e76 --- /dev/null +++ b/tests/packagedcode/data/publiccode/publiccode.yml @@ -0,0 +1,49
@@ +# Hand-crafted publiccode.yml test fixture based on examples from: +# https://github.com/publiccodeyml/publiccode.yml/blob/main/docs/standard/schema.core.rst +publiccodeYmlVersion: "0.4" + +name: Medusa +url: "https://example.com/italia/medusa.git" +landingURL: "https://example.com/medusa" +softwareVersion: "1.0.3" + +platforms: + - web + - linux + +categories: + - financial-reporting + - accounting + +developmentStatus: stable +softwareType: "standalone/desktop" + +description: + en: + shortDescription: > + A short description of this software. + longDescription: > + A very long description of this software. It explains what it does, + who it is for, and why you might want to use it in a public + administration context. + features: + - Feature one + - Feature two + +legal: + license: AGPL-3.0-or-later + mainCopyrightOwner: City of Example + repoOwner: City of Example + +maintenance: + type: "contract" + contacts: + - name: Francesco Rossi + email: f.rossi@example.com + affiliation: City of Example + +localisation: + localisationReady: true + availableLanguages: + - en + - it diff --git a/tests/packagedcode/data/publiccode/publiccode.yml-expected.json b/tests/packagedcode/data/publiccode/publiccode.yml-expected.json new file mode 100644 index 0000000000..f1c4811efd --- /dev/null +++ b/tests/packagedcode/data/publiccode/publiccode.yml-expected.json @@ -0,0 +1,88 @@ +[ + { + "type": "publiccode", + "namespace": null, + "name": "Medusa", + "version": "1.0.3", + "qualifiers": {}, + "subpath": null, + "primary_language": null, + "description": "A very long description of this software. 
It explains what it does, who it is for, and why you might want to use it in a public administration context.", + "release_date": null, + "parties": [ + { + "type": "person", + "role": "maintainer", + "name": "Francesco Rossi", + "email": "f.rossi@example.com", + "url": null + } + ], + "keywords": [ + "financial-reporting", + "accounting" + ], + "homepage_url": "https://example.com/medusa", + "download_url": null, + "size": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha512": null, + "bug_tracking_url": null, + "code_view_url": null, + "vcs_url": "https://example.com/italia/medusa.git", + "copyright": "City of Example", + "holder": "City of Example", + "declared_license_expression": "agpl-3.0-plus", + "declared_license_expression_spdx": "AGPL-3.0-or-later", + "license_detections": [ + { + "license_expression": "agpl-3.0-plus", + "license_expression_spdx": "AGPL-3.0-or-later", + "matches": [ + { + "license_expression": "agpl-3.0-plus", + "license_expression_spdx": "AGPL-3.0-or-later", + "from_file": null, + "start_line": 1, + "end_line": 1, + "matcher": "1-hash", + "score": 100.0, + "matched_length": 5, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE", + "matched_text": "AGPL-3.0-or-later" + } + ], + "identifier": "agpl_3_0_plus-a0f62d44-7e99-852b-0b1c-0bc5e1c9f6d0" + } + ], + "other_license_expression": null, + "other_license_expression_spdx": null, + "other_license_detections": [], + "extracted_license_statement": "AGPL-3.0-or-later", + "notice_text": null, + "source_packages": [], + "file_references": [], + "is_private": false, + "is_virtual": false, + "extra_data": { + "publiccodeYmlVersion": "0.4", + "platforms": [ + "web", + "linux" + ], + "developmentStatus": "stable", + "softwareType": "standalone/desktop" + 
}, + "dependencies": [], + "repository_homepage_url": null, + "repository_download_url": null, + "api_data_url": null, + "datasource_id": "publiccode_yml", + "purl": "pkg:publiccode/Medusa@1.0.3" + } +] \ No newline at end of file diff --git a/tests/packagedcode/data/publiccode/publiccode.yml-scancode.json b/tests/packagedcode/data/publiccode/publiccode.yml-scancode.json new file mode 100644 index 0000000000..b5c9f8359e --- /dev/null +++ b/tests/packagedcode/data/publiccode/publiccode.yml-scancode.json @@ -0,0 +1,193 @@ +{ + "packages": [ + { + "type": "publiccode", + "namespace": null, + "name": "Medusa", + "version": "1.0.3", + "qualifiers": {}, + "subpath": null, + "primary_language": null, + "description": "A very long description of this software. It explains what it does, who it is for, and why you might want to use it in a public administration context.", + "release_date": null, + "parties": [ + { + "type": "person", + "role": "maintainer", + "name": "Francesco Rossi", + "email": "f.rossi@example.com", + "url": null + } + ], + "keywords": [ + "financial-reporting", + "accounting" + ], + "homepage_url": "https://example.com/medusa", + "download_url": null, + "size": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha512": null, + "bug_tracking_url": null, + "code_view_url": null, + "vcs_url": "https://example.com/italia/medusa.git", + "copyright": "City of Example", + "holder": "City of Example", + "declared_license_expression": "agpl-3.0-plus", + "declared_license_expression_spdx": "AGPL-3.0-or-later", + "license_detections": [ + { + "license_expression": "agpl-3.0-plus", + "license_expression_spdx": "AGPL-3.0-or-later", + "matches": [ + { + "license_expression": "agpl-3.0-plus", + "license_expression_spdx": "AGPL-3.0-or-later", + "from_file": "publiccode.yml", + "start_line": 1, + "end_line": 1, + "matcher": "1-hash", + "score": 100.0, + "matched_length": 5, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": 
"spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE", + "matched_text": "AGPL-3.0-or-later" + } + ], + "identifier": "agpl_3_0_plus-a0f62d44-7e99-852b-0b1c-0bc5e1c9f6d0" + } + ], + "other_license_expression": null, + "other_license_expression_spdx": null, + "other_license_detections": [], + "extracted_license_statement": "AGPL-3.0-or-later", + "notice_text": null, + "source_packages": [], + "is_private": false, + "is_virtual": false, + "extra_data": { + "publiccodeYmlVersion": "0.4", + "platforms": [ + "web", + "linux" + ], + "developmentStatus": "stable", + "softwareType": "standalone/desktop" + }, + "repository_homepage_url": null, + "repository_download_url": null, + "api_data_url": null, + "package_uid": "pkg:publiccode/Medusa@1.0.3?uuid=fixed-uid-done-for-testing-5642512d1758", + "datafile_paths": [ + "publiccode.yml" + ], + "datasource_ids": [ + "publiccode_yml" + ], + "purl": "pkg:publiccode/Medusa@1.0.3" + } + ], + "dependencies": [], + "files": [ + { + "path": "publiccode.yml", + "type": "file", + "package_data": [ + { + "type": "publiccode", + "namespace": null, + "name": "Medusa", + "version": "1.0.3", + "qualifiers": {}, + "subpath": null, + "primary_language": null, + "description": "A very long description of this software. 
It explains what it does, who it is for, and why you might want to use it in a public administration context.", + "release_date": null, + "parties": [ + { + "type": "person", + "role": "maintainer", + "name": "Francesco Rossi", + "email": "f.rossi@example.com", + "url": null + } + ], + "keywords": [ + "financial-reporting", + "accounting" + ], + "homepage_url": "https://example.com/medusa", + "download_url": null, + "size": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha512": null, + "bug_tracking_url": null, + "code_view_url": null, + "vcs_url": "https://example.com/italia/medusa.git", + "copyright": "City of Example", + "holder": "City of Example", + "declared_license_expression": "agpl-3.0-plus", + "declared_license_expression_spdx": "AGPL-3.0-or-later", + "license_detections": [ + { + "license_expression": "agpl-3.0-plus", + "license_expression_spdx": "AGPL-3.0-or-later", + "matches": [ + { + "license_expression": "agpl-3.0-plus", + "license_expression_spdx": "AGPL-3.0-or-later", + "from_file": "publiccode.yml", + "start_line": 1, + "end_line": 1, + "matcher": "1-hash", + "score": 100.0, + "matched_length": 5, + "match_coverage": 100.0, + "rule_relevance": 100, + "rule_identifier": "spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE", + "rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_agpl-3.0-or-later_for_agpl-3.0-plus.RULE", + "matched_text": "AGPL-3.0-or-later" + } + ], + "identifier": "agpl_3_0_plus-a0f62d44-7e99-852b-0b1c-0bc5e1c9f6d0" + } + ], + "other_license_expression": null, + "other_license_expression_spdx": null, + "other_license_detections": [], + "extracted_license_statement": "AGPL-3.0-or-later", + "notice_text": null, + "source_packages": [], + "file_references": [], + "is_private": false, + "is_virtual": false, + "extra_data": { + "publiccodeYmlVersion": "0.4", + "platforms": [ + "web", + "linux" + ], + "developmentStatus": "stable", + "softwareType": 
"standalone/desktop" + }, + "dependencies": [], + "repository_homepage_url": null, + "repository_download_url": null, + "api_data_url": null, + "datasource_id": "publiccode_yml", + "purl": "pkg:publiccode/Medusa@1.0.3" + } + ], + "for_packages": [ + "pkg:publiccode/Medusa@1.0.3?uuid=fixed-uid-done-for-testing-5642512d1758" + ], + "scan_errors": [] + } + ] +} \ No newline at end of file
diff --git a/tests/packagedcode/test_publiccode.py b/tests/packagedcode/test_publiccode.py
new file mode 100644
index 0000000000..c1b09a3b9f
--- /dev/null
+++ b/tests/packagedcode/test_publiccode.py
@@ -0,0 +1,69 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import os
+
+from packagedcode import publiccode
+from packages_test_utils import PackageTester
+from scancode.cli_test_utils import check_json_scan
+from scancode.cli_test_utils import run_scan_click
+from scancode_config import REGEN_TEST_FIXTURES
+
+
+class TestPubliccode(PackageTester):
+    """
+    Tests for the publiccode.yml datafile handler.
+    """
+    test_data_dir = os.path.join(os.path.dirname(__file__), 'data')
+
+    def test_publiccode_yml_is_datafile(self):
+        # The publiccode.yml fixture must be accepted by is_datafile().
+        location = self.get_test_loc('publiccode/publiccode.yml')
+        assert publiccode.PubliccodeYmlHandler.is_datafile(location)
+
+    def test_parse_publiccode_yml(self):
+        # Compare the parsed package data with the expected JSON fixture.
+        location = self.get_test_loc('publiccode/publiccode.yml')
+        expected = self.get_test_loc(
+            'publiccode/publiccode.yml-expected.json',
+            must_exist=False,
+        )
+        self.check_packages_data(
+            packages_data=publiccode.PubliccodeYmlHandler.parse(location),
+            expected_loc=expected,
+            must_exist=False,
+            regen=REGEN_TEST_FIXTURES,
+        )
+
+    def test_scan_cli_works(self):
+        # Run a --package scan and compare with the expected scan JSON.
+        location = self.get_test_loc('publiccode/publiccode.yml')
+        expected = self.get_test_loc(
+            'publiccode/publiccode.yml-scancode.json',
+            must_exist=False,
+        )
+        results = self.get_temp_file('results.json')
+        scan_args = ['--package', location, '--json', results]
+        run_scan_click(scan_args)
+        check_json_scan(
+            expected_file=expected,
+            result_file=results,
+            remove_uuid=True,
+            regen=REGEN_TEST_FIXTURES,
+        )
+
+    def test_publiccode_yml_no_version_key_returns_nothing(self):
+        # YAML without a 'publiccodeYmlVersion' key yields no package.
+        location = self.get_temp_file(extension='yml', file_name='publiccode')
+        with open(location, 'w') as output:
+            output.write('name: something\nversion: 1.0\n')
+
+        assert list(publiccode.PubliccodeYmlHandler.parse(location)) == []
+
+    def test_publiccode_yml_path_patterns(self):
+        expected_patterns = ('*publiccode.yml', '*publiccode.yaml')
+        handler = publiccode.PubliccodeYmlHandler
+        assert handler.path_patterns == expected_patterns