-
Notifications
You must be signed in to change notification settings - Fork 13
build(medcat and medcat-den): CU-869ddh1jv Avoid test resources in releases #503
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
807f8f6
f9d11f0
64f4fd7
befa9f8
23ba05b
a6b960c
7578151
e0241b9
204916c
158ae94
679514c
370afae
1efc53a
fe6dd1f
c282dca
54d47ad
f3f331f
95fbd70
4a623e9
5391667
9eb5594
3dde906
5f21cab
18641a5
d0d2058
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| # .github/workflows/upload-test-models.yml | ||
| name: Upload test models to release | ||
|
|
||
| on: | ||
| workflow_call: | ||
| inputs: | ||
| tag: | ||
| required: true | ||
| type: string | ||
|
|
||
| jobs: | ||
| upload-test-models: | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - uses: actions/checkout@v4 | ||
|
|
||
| - name: Upload test models to release | ||
| env: | ||
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
| run: | | ||
| gh release upload ${{ inputs.tag }} medcat-test-models/* --clobber | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,7 @@ dev = [ | |
| "ruff", | ||
| "mypy", | ||
| "diskcache-stubs", | ||
| "pooch", | ||
| ] | ||
|
|
||
| [project.urls] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| # NOTE: this file is designed to be copied across the following sub-folders | ||
| # 1. medcat-v2/tests/resource_fetch.py | ||
| # 2. medcat-den/tests/resource_fetch.py | ||
| # So if you make changes here, copy them over to the others as well. | ||
| # | ||
| # NB! This does mean we have duplicate code. But to me the alternatives | ||
| # are note better: | ||
| # a) keep and install a separate local project - not portable | ||
| # b) publish and install from PyPI - extra maintenance burden | ||
|
|
||
|
|
||
| import os | ||
| import pooch | ||
| import importlib | ||
| from enum import Enum | ||
|
|
||
|
|
||
| _REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) | ||
| _CENTRAL_RESOURCES = os.path.join(_REPO_ROOT, 'medcat-test-models') | ||
|
|
||
| class DefinedResource(Enum): | ||
| v1_model = "mct_v1_model_pack.zip" | ||
| v2_model = "mct2_model_pack.zip" | ||
|
|
||
|
|
||
| def _get_version(project_name: str = 'medcat') -> str: | ||
| # NOTE: plan to use this for medcat-den as well | ||
| try: | ||
| pkg = importlib.import_module(project_name) | ||
| ver = getattr(pkg, '__version__') | ||
| if ver is None: | ||
| raise | ||
| return "%2F".join((project_name, f"v{ver}")) | ||
| except ImportError: | ||
| raise RuntimeError( | ||
| f"Could not determine version for '{project_name}'. " | ||
| f"Is the package installed?" | ||
| ) | ||
|
|
||
|
|
||
| def _download_resource(version: str, relative_path: str) -> str: | ||
| url = f"https://github.com/CogStack/cogstack-nlp/releases/download/{version}/{relative_path}" | ||
| try: | ||
| return pooch.retrieve( | ||
| url=url, | ||
| known_hash=None, | ||
| path=pooch.os_cache('medcat_tests'), | ||
| fname=relative_path, | ||
| ) | ||
| except Exception as e: | ||
| raise FileNotFoundError( | ||
| f"Test resource '{relative_path}' not found locally in '{_CENTRAL_RESOURCES}' " | ||
| f"and could not be fetched from release {version!r}. " | ||
| f"If developing locally, ensure 'medcat-test-models/' exists at the repo root. " | ||
| f"Original error: {e}" | ||
| ) from e | ||
|
|
||
|
|
||
| def get_resource(relative_path: str | DefinedResource, project_name: str = 'medcat') -> str: | ||
| """ | ||
| Returns a local path to the requested test resource. | ||
| Prefers the central repo location (medcat-test-models/) if available, | ||
| falls back to downloading from the corresponding release via pooch. | ||
| """ | ||
| # allow passing string version of defined resoure (e.g v1_model) | ||
| try: | ||
| relative_path = DefinedResource[relative_path] | ||
| except KeyError: | ||
| pass # treat as a literal path | ||
| if isinstance(relative_path, DefinedResource): | ||
| relative_path = relative_path.value | ||
| central_path = os.path.join(_CENTRAL_RESOURCES, relative_path) | ||
|
|
||
| if os.path.exists(central_path): | ||
| return central_path | ||
|
|
||
| version = _get_version(project_name) | ||
| return _download_resource(version, relative_path) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,6 +86,7 @@ dev = [ | |
| "types-tqdm", | ||
| "types-setuptools", | ||
| "types-PyYAML", | ||
| "pooch", | ||
| ] | ||
| spacy = [ | ||
| "spacy", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| # NOTE: this file is designed to be copied across the following sub-folders | ||
| # 1. medcat-v2/tests/resource_fetch.py | ||
| # 2. medcat-den/tests/resource_fetch.py | ||
| # So if you make changes here, copy them over to the others as well. | ||
| # | ||
| # NB! This does mean we have duplicate code. But to me the alternatives | ||
| # are note better: | ||
| # a) keep and install a separate local project - not portable | ||
| # b) publish and install from PyPI - extra maintenance burden | ||
|
|
||
|
|
||
| import os | ||
| import pooch | ||
| import importlib | ||
| from enum import Enum | ||
|
|
||
|
|
||
| _REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) | ||
| _CENTRAL_RESOURCES = os.path.join(_REPO_ROOT, 'medcat-test-models') | ||
|
|
||
| class DefinedResource(Enum): | ||
| v1_model = "mct_v1_model_pack.zip" | ||
| v2_model = "mct2_model_pack.zip" | ||
|
|
||
|
|
||
| def _get_version(project_name: str = 'medcat') -> str: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As a dumber option as it's all hardcoded anyway - could we just pin an exact stable version instead of using _get_version? I'd probably rather always pull Right now it kind of looks like the medcat version will change the test model used, but I'd rather make it a deliberate change. I really dont see the test models changing any time soon
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see what you're saying. And I agree that these files are unlikely to change frequently. But I do think it's realistic that they may be added to at some point. Hard-coding the version here would then mean we'd need to bump the version every time we do a release. |
||
| # NOTE: plan to use this for medcat-den as well | ||
| try: | ||
| pkg = importlib.import_module(project_name) | ||
| ver = getattr(pkg, '__version__') | ||
| if ver is None: | ||
| raise | ||
| return "%2F".join((project_name, f"v{ver}")) | ||
| except ImportError: | ||
| raise RuntimeError( | ||
| f"Could not determine version for '{project_name}'. " | ||
| f"Is the package installed?" | ||
| ) | ||
|
|
||
|
|
||
| def _download_resource(version: str, relative_path: str) -> str: | ||
| url = f"https://github.com/CogStack/cogstack-nlp/releases/download/{version}/{relative_path}" | ||
| try: | ||
| return pooch.retrieve( | ||
| url=url, | ||
| known_hash=None, | ||
| path=pooch.os_cache('medcat_tests'), | ||
| fname=relative_path, | ||
| ) | ||
| except Exception as e: | ||
| raise FileNotFoundError( | ||
| f"Test resource '{relative_path}' not found locally in '{_CENTRAL_RESOURCES}' " | ||
| f"and could not be fetched from release {version!r}. " | ||
| f"If developing locally, ensure 'medcat-test-models/' exists at the repo root. " | ||
| f"Original error: {e}" | ||
| ) from e | ||
|
|
||
|
|
||
| def get_resource(relative_path: str | DefinedResource, project_name: str = 'medcat') -> str: | ||
| """ | ||
| Returns a local path to the requested test resource. | ||
| Prefers the central repo location (medcat-test-models/) if available, | ||
| falls back to downloading from the corresponding release via pooch. | ||
| """ | ||
| # allow passing string version of defined resoure (e.g v1_model) | ||
| try: | ||
| relative_path = DefinedResource[relative_path] | ||
| except KeyError: | ||
| pass # treat as a literal path | ||
| if isinstance(relative_path, DefinedResource): | ||
| relative_path = relative_path.value | ||
| central_path = os.path.join(_CENTRAL_RESOURCES, relative_path) | ||
|
|
||
| if os.path.exists(central_path): | ||
| return central_path | ||
|
|
||
| version = _get_version(project_name) | ||
| return _download_resource(version, relative_path) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Along with the pin version comment -
Potentially we could do a release/tag for "medcat-test-models" separately, and test models from the medcat/medcat-den versions/releases? (Basically what you did with the release bundle idea).
With the assumption that the test models basically never change. Main advantage is it would probably be easier to reason about "medcat 3 is failing test X, but I know the test model zip is still version 1 and that exact file hasn't changed for a year", but I do see the advantage of versioning it all together as well, I'm sure you've already thought about this one! Definitely hard if the zip in source code changes but the released one doesnt...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we end up using this for more different parts it may make sense to centralise the entire logic into a installable package.
I think the main thing I want to avoid is yet another release cadence. As much as it's unlikely we'll need to bump this often, having these not tied to the releases of the specific tools that use them would (at least in my eyes) make it harder to track what version of which tool should work with what version of these test time models.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool - yeah that's fair for sure that it gets harder to track. I'm all good with this as you have it!