diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..faef645
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,210 @@
+# Builds the package with Poetry and publishes a GitHub Release tagged
+# v<version>, with notes generated from Conventional Commit subjects since the
+# previous tag. Nothing is built or released when the tag already exists.
+name: Release DataConnect Python Package
+
+on:
+  push:
+    branches: [ main ]
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version to release (leave empty for auto from pyproject.toml)'
+        required: false
+        default: ''
+
+# Serialize releases per ref. A running release is never cancelled: stopping
+# between tag creation and asset upload would leave a half-published release.
+concurrency:
+  group: release-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  build-and-release:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          # Full history and tags are required to find the previous tag.
+          fetch-depth: 0
+          fetch-tags: true
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+
+      - name: Install Poetry
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+
+      - name: Get package info
+        id: pkg-info
+        env:
+          # Empty on push events and when the manual input is left blank.
+          INPUT_VERSION: ${{ github.event.inputs.version }}
+        run: |
+          # Match the keys exactly so e.g. `version_scheme` is not picked up.
+          PKG_NAME=$(grep -m1 -E '^name[[:space:]]*=' pyproject.toml | awk -F'"' '{print $2}')
+          PKG_VERSION=$(grep -m1 -E '^version[[:space:]]*=' pyproject.toml | awk -F'"' '{print $2}')
+          # A manually supplied version wins; tolerate a leading "v".
+          if [ -n "$INPUT_VERSION" ]; then
+            PKG_VERSION="${INPUT_VERSION#v}"
+          fi
+          # Fail early instead of creating a release tagged just "v".
+          if [ -z "$PKG_NAME" ] || [ -z "$PKG_VERSION" ]; then
+            echo "::error::Could not determine package name or version"
+            exit 1
+          fi
+          echo "version=$PKG_VERSION" >> "$GITHUB_OUTPUT"
+          echo "name=$PKG_NAME" >> "$GITHUB_OUTPUT"
+          echo "Package: $PKG_NAME, Version: $PKG_VERSION"
+
+      - name: Check existing tag
+        id: check_tag
+        env:
+          TAG: v${{ steps.pkg-info.outputs.version }}
+        run: |
+          # Look under refs/tags/ only, so a same-named branch is not mistaken for the tag.
+          if git rev-parse -q --verify "refs/tags/$TAG" >/dev/null; then
+            echo "tag_exists=true" >> "$GITHUB_OUTPUT"
+            echo "Tag $TAG already exists, will not create duplicate"
+          else
+            echo "tag_exists=false" >> "$GITHUB_OUTPUT"
+            echo "Tag $TAG does not exist yet"
+          fi
+
+      - name: Build package
+        if: steps.check_tag.outputs.tag_exists != 'true'
+        env:
+          INPUT_VERSION: ${{ github.event.inputs.version }}
+          PKG_VERSION: ${{ steps.pkg-info.outputs.version }}
+        run: |
+          # Keep the built artifacts in sync with a manually supplied version.
+          if [ -n "$INPUT_VERSION" ]; then
+            poetry version "$PKG_VERSION"
+          fi
+          poetry build
+
+      - name: Find previous tag
+        id: find-previous-tag
+        if: steps.check_tag.outputs.tag_exists != 'true'
+        run: |
+          PREVIOUS_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
+          if [ -z "$PREVIOUS_TAG" ]; then
+            echo "No previous tag found - will generate changelog from all commits"
+            echo "previous_tag=" >> "$GITHUB_OUTPUT"
+            # Empty from_ref means "whole history" (root commit included).
+            echo "from_ref=" >> "$GITHUB_OUTPUT"
+          else
+            echo "Previous tag found: $PREVIOUS_TAG"
+            echo "previous_tag=$PREVIOUS_TAG" >> "$GITHUB_OUTPUT"
+            echo "from_ref=$PREVIOUS_TAG" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Generate changelog
+        id: generate-changelog
+        if: steps.check_tag.outputs.tag_exists != 'true'
+        env:
+          FROM_REF: ${{ steps.find-previous-tag.outputs.from_ref }}
+          CURRENT_VERSION: ${{ steps.pkg-info.outputs.version }}
+        run: |
+          if [ -n "$FROM_REF" ]; then
+            RANGE="$FROM_REF..HEAD"
+          else
+            RANGE="HEAD"
+          fi
+          echo "Generating changelog for $RANGE..."
+          mkdir -p .github
+
+          CURRENT_DATE=$(date +"%Y-%m-%d")
+
+          BRANCH_NAME=$(git branch --show-current 2>/dev/null || echo "")
+          # Require at least one digit so a bare "MCC-" is not listed as a ticket.
+          JIRA_TICKETS=$(echo "$BRANCH_NAME" | grep -oE 'MCC-[0-9]+' | sort -u | tr '\n' ' ')
+
+          echo "# Release Notes for v$CURRENT_VERSION ($CURRENT_DATE)" > .github/release-notes.md
+          echo "" >> .github/release-notes.md
+
+          if [ -n "$JIRA_TICKETS" ]; then
+            echo "## Related Issues" >> .github/release-notes.md
+            for ticket in $JIRA_TICKETS; do
+              echo "- $ticket" >> .github/release-notes.md
+            done
+            echo "" >> .github/release-notes.md
+          fi
+
+          git log "$RANGE" --pretty=format:"%s" > .github/all-commits.txt
+
+          # Append a section for one Conventional Commit type; returns 1 when no commit matches.
+          # $1 = type prefix, $2 = emoji, $3 = section title
+          extract_commits() {
+            local kind=$1
+            local emoji=$2
+            local title=$3
+            echo "## $emoji $title" > ".github/section-$kind.md"
+            grep -E "^$kind(\([^)]*\))?!?:" .github/all-commits.txt | \
+              sed -E "s/^$kind(\([^)]*\))?!?:[[:space:]]*/- /" >> ".github/section-$kind.md" || true
+            # More than the heading line means at least one commit matched.
+            if [ "$(wc -l < ".github/section-$kind.md")" -gt 1 ]; then
+              cat ".github/section-$kind.md" >> .github/release-notes.md
+              echo "" >> .github/release-notes.md
+              return 0
+            else
+              return 1
+            fi
+          }
+
+          extract_commits "feat" "🚀" "Features" || true
+          extract_commits "fix" "🐛" "Bug Fixes" || true
+          extract_commits "perf" "⚡" "Performance Improvements" || true
+          extract_commits "refactor" "♻️" "Refactoring" || true
+          extract_commits "docs" "📚" "Documentation" || true
+          extract_commits "chore" "🔧" "Maintenance" || true
+          extract_commits "ci" "👷" "CI/CD" || true
+
+          # Anything that is neither a known type nor a merge commit.
+          OTHER_COMMITS=$(grep -vE "^(feat|fix|perf|refactor|docs|chore|ci)(\([^)]*\))?!?:" \
+            .github/all-commits.txt | grep -v "^Merge " || true)
+          if [ -n "$OTHER_COMMITS" ]; then
+            echo "## 📝 Other Changes" >> .github/release-notes.md
+            echo "$OTHER_COMMITS" | sed 's/^/- /' >> .github/release-notes.md
+            echo "" >> .github/release-notes.md
+          fi
+
+          # A "!" before the colon marks a breaking change in Conventional Commits.
+          BREAKING=$(grep -E "^[a-z]+(\([^)]*\))?!:" .github/all-commits.txt || true)
+          if [ -n "$BREAKING" ]; then
+            echo "## ⚠️ Breaking Changes" >> .github/release-notes.md
+            echo "$BREAKING" | sed 's/^/- /' >> .github/release-notes.md
+            echo "" >> .github/release-notes.md
+          fi
+
+      - name: Display release notes
+        if: steps.check_tag.outputs.tag_exists != 'true'
+        run: |
+          echo "--- Release Notes Content ---"
+          cat .github/release-notes.md || echo "No release notes were generated"
+          echo "-----------------------------"
+
+      - name: Create Release
+        if: steps.check_tag.outputs.tag_exists != 'true'
+        uses: ncipollo/release-action@v1
+        with:
+          name: "Release ${{ steps.pkg-info.outputs.version }}"
+          tag: "v${{ steps.pkg-info.outputs.version }}"
+          commit: ${{ github.sha }}
+          artifacts: "dist/*"
+          bodyFile: .github/release-notes.md
+          token: ${{ secrets.GITHUB_TOKEN }}
+          draft: false
+          prerelease: ${{ contains(steps.pkg-info.outputs.version, 'rc') || contains(steps.pkg-info.outputs.version, 'b') || contains(steps.pkg-info.outputs.version, 'a') }}
+          skipIfReleaseExists: true
diff --git a/.gitignore b/.gitignore
index 7b20724..d8628ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ __pycache__/
 # Jupyter Notebook
 .ipynb_checkpoints
 *.ipynb
+!dataconnect_usage.ipynb
 
 # Distribution / packaging
 .Python
diff --git a/guides/dataconnect_quickstart.md b/guides/dataconnect_quickstart.md
new file mode 100644
index 0000000..0d9f095
--- /dev/null
+++ b/guides/dataconnect_quickstart.md
@@ -0,0 +1,29 @@
+# DataConnect Python Library - Quick Start
+
+Instructions in this document apply only once all the steps of the [Setup document](dataconnect_setup.md) are followed, and you have the [Jupyter notebook](dataconnect_usage.ipynb) opened in your IDE where the Data Connect Python Library package was installed.
+
+*Note:* The `User Authentication Token` used to connect with DataConnect and make function calls can be generated from `iMedidata` > `Data Connect` > `Developer Center`.
+ +## Jupyter +* Make sure the Jupyter notebook being used points to the correct Python Virtual Environment. You can configure that by clicking on `Select Kernel` on the top right, and pick the `venv` that has `Python3.13` configured. +* In the Jupyter notebook, under *Preparation*, enter the user token from `Data Connect Developer Center`. +* Run all the code-cells until **Get all available studies** + * Feel free to enter a `search_study_name` wildcard value + * Confirm that the `get_studies()` call works. +* Continue running other code-cells in the notebook as desired. + +## Stand-alone code +* You may write your own Python files to access the Data Connect Python Library, but they must be in the same directory. +* Sample code: + + ```python + from uuid import UUID + from dataconnect import DataConnectClient + + with DataConnectClient.connect( + token="user-token-from-dataconnect", + ) as client: + result = client.get_studies(search_study_name="clin") + print(result.total_records) # total number of studies accessible to the user + print(result.studies) # list of Study objects + ``` diff --git a/guides/dataconnect_setup.md b/guides/dataconnect_setup.md new file mode 100644 index 0000000..903b524 --- /dev/null +++ b/guides/dataconnect_setup.md @@ -0,0 +1,46 @@ +# DataConnect Python Library Setup + +This document is intended for first-time end-users. + +## Prerequisites +### Environment +* Python 3.13 + * Should automatically include `pip` and `venv` (Python Virtual Environment) +* IDE of choice - `Visual Studio Code`, `PyCharm` etc. with `Jupyter` plugin + +### Credentials +* An iMedidata Account +* Access to **DataConnect**'s **Developer Center** and **Transformations** in iMedidata + +*Note:* You will need a generated `User Token` from **Developer Center** to make any function calls in this library. + +## Setup +* On your Terminal window of choice (`bash`, `zsh`, `iTerm`, `WSL`, `gitBash` etc), create a new directory and go to it. 
+ + ```bash + mkdir dataconnect && cd $_ + ``` + +* Create a Python Virtual Environment and Activate it. Depending on your setup, you may use the `python` command instead of `python3`. + + ```bash + python3 -m venv ./.venv + source ./.venv/bin/activate + ``` + + * To confirm that the virtual environment has been created and activated, simply enter `which python3` (or `which python`) and it should point to the `dataconnect/.venv/bin` path. If not, run the above commands again. + + +* Run the following command to fetch the latest-released `dataconnect-library-python` package. Releases are tagged as `v<version>`; in this example, version `1.0.0` (tag `v1.0.0`) is assumed to be the latest release: + + ```bash + pip install git+https://github.com/mdsol/dataconnect-library-python.git@v1.0.0 + ``` + +* If there are no errors in fetching the package, open the directory in your IDE. For example, run this for VS Code: + + ```bash + code . + ``` + +* Download the usage Jupyter notebook (`dataconnect_usage.ipynb`) and open it in the same IDE window. diff --git a/guides/dataconnect_usage.ipynb b/guides/dataconnect_usage.ipynb new file mode 100644 index 0000000..c32c975 --- /dev/null +++ b/guides/dataconnect_usage.ipynb @@ -0,0 +1,640 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "909c5016", + "metadata": {}, + "source": [ + "# Data Connect Python Library - Usage\n", + "\n", + "***Version Note:***\n", + "This notebook reflects the *latest version* of the `dataconnect` Python library.\n", + "If you are using an older version, the functions or parameters described here may differ from your installed version.\n", + "\n", + "***View version-matched documentation:***\n", + "On GitHub, use the branch/tag selector to switch to your version's tag (e.g., `v0.1.0`)." + ] + }, + { + "cell_type": "markdown", + "id": "491849ae", + "metadata": {}, + "source": [ + "### Setup\n", + "Refer to the [Setup](dataconnect_setup.md) and [Quick Start](dataconnect_quickstart.md) documents first." 
+ ] + }, + { + "cell_type": "markdown", + "id": "64e8d2c2", + "metadata": {}, + "source": [ + "### Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fe8bd656", + "metadata": {}, + "outputs": [], + "source": [ + "# We recommend storing user_token in a separate file or environment variable.\n", + "# This is the user authentication token generated from the Developer Center in iMedidata Platform.\n", + "\n", + "user_token = \"usertoken\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "40152cc2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load prerequisite libraries\n", + "from uuid import UUID\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "62e5659ef27811b7", + "metadata": {}, + "source": [ + "### Initialize the connection to Data Connect\n", + "\n", + "_Note_: Do remember to close this connection after your operations, or use a context manager (`with` statement) to ensure proper resource management." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c60b75ab2e67426b", + "metadata": {}, + "outputs": [], + "source": [ + "from dataconnect import DataConnectClient\n", + "\n", + "dataconnect_client = DataConnectClient.connect(token=user_token)" + ] + }, + { + "cell_type": "markdown", + "id": "b4440c60", + "metadata": {}, + "source": [ + "### Get all available studies\n", + "\n", + "Get all available studies or search for a study by name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0708cee", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " study_result = dataconnect_client.get_studies(search_study_name=\"\")\n", + "\n", + " if study_result:\n", + " print(f\"\\n--- Total Studies: {study_result.total_records} ---\\n\")\n", + "\n", + " for study in study_result.studies:\n", + " envs_list = \", \".join(env.name for env in study.environments if env.name)\n", + " print(f\"• Study: {study.name}\")\n", + " if envs_list:\n", + " print(f\" Envs: {envs_list}\\n\")\n", + " else:\n", + " print(\" Envs: (none)\\n\")\n", + " else:\n", + " print(\"No Studies found.\")\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "07ac39b7", + "metadata": {}, + "source": [ + "### Work with datasets with `dataconnect` library\n", + "\n", + "#### Search datasets in a study environment\n", + "\n", + "Retrieve the study and study environment information from \"Developer info\" in iMedidata Platform." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd535ac", + "metadata": {}, + "outputs": [], + "source": [ + "pwb_study = {\n", + " \"study_id\": \"studyid\",\n", + " \"prod_env_id\": \"studyenvironmentid\",\n", + "}\n", + "\n", + "exciter_study = {\n", + " \"study_id\": \"studyid\",\n", + " \"prod_env_id\": \"studyenvironmentid\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "b357b826", + "metadata": {}, + "source": [ + "Set search keyword." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1de9ddb", + "metadata": {}, + "outputs": [], + "source": [ + "search_name = \"datasetname\"" + ] + }, + { + "cell_type": "markdown", + "id": "f900625f", + "metadata": {}, + "source": [ + "Search for a dataset in a specific study environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25c51a7d", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " datasets = dataconnect_client.get_datasets(\n", + " study_environment_uuid=UUID(exciter_study[\"prod_env_id\"]),\n", + " search_dataset_name=search_name,\n", + " page=1,\n", + " page_size=5,\n", + " )\n", + "\n", + " print(f\"Total datasets available across all pages: {datasets.total_records}\")\n", + " print(\n", + " f\"Page: {datasets.pagination.page}, \"\n", + " f\"Page size: {datasets.pagination.page_size}, \"\n", + " f\"Total pages: {datasets.pagination.total_pages}\"\n", + " )\n", + "\n", + " for dataset in datasets.items:\n", + " print(f\"\\n Dataset: {dataset.dataset_name}\")\n", + " print(f\" UUID: {dataset.dataset_uuid}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7a1b4890", + "metadata": {}, + "source": [ + "#### Identify different versions of a specific dataset\n", + "\n", + "Prepare a list of datasets that are needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4727acb6", + "metadata": {}, + "outputs": [], + "source": [ + "# dataset_uuid values can be retrieved from get_datasets()\n", + "dataset_ids = {\n", + " \"pwb_der_id\": \"datasetid\",\n", + " \"pwb_import_id\": \"datasetid\",\n", + " \"pwb_rave_id\": \"datasetid\",\n", + " \"pwb_custom_id\": \"datasetid\",\n", + " \"exciter_der_id\": \"datasetid\",\n", + " \"exciter_import_id\": \"datasetid\",\n", + " \"exciter_rave_id\": \"datasetid\",\n", + " \"exciter_custom_id\": \"datasetid\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7985f2b9", + "metadata": {}, + "source": [ + "Retrieve other dataset versions if they are available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7f0fea7", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " dataset_versions = dataconnect_client.get_dataset_versions(dataset_uuid=UUID(dataset_ids[\"exciter_custom_id\"]))\n", + "\n", + " for version in dataset_versions:\n", + " print(f\" Dataset: {version.dataset_name}\")\n", + " print(f\" Version: {version.dataset_version}\")\n", + " print(f\" UUID: {version.dataset_uuid}\\n\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fb0f253e", + "metadata": {}, + "source": [ + "#### Fetch records for specific datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dda6f0cc", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " rave_data = dataconnect_client.fetch_data(dataset_uuid=UUID(dataset_ids[\"exciter_rave_id\"]))\n", + " # Show the first 3 rows (display() is needed: a bare expression inside try is not rendered)\n", + " display(rave_data.head(3))\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73ccece", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " der_data = dataconnect_client.fetch_data(dataset_uuid=UUID(dataset_ids[\"pwb_der_id\"]))\n", + " # Show the first 6 rows of data\n", + " display(der_data.head(6))\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7413a5f5", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import_data = dataconnect_client.fetch_data(dataset_uuid=UUID(dataset_ids[\"pwb_import_id\"]))\n", + " # Show the first 3 rows of data\n", + " display(import_data.head(3))\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09b7aca3", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " custom_data = 
dataconnect_client.fetch_data(dataset_uuid=UUID(dataset_ids[\"exciter_custom_id\"]))\n", + " # Show the first 3 rows of data\n", + " display(custom_data.head(3))\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d2322e61", + "metadata": {}, + "source": [ + "You can also use `first_n_rows` to limit the number of rows fetched, which is useful for large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b797d800ac0de9ef", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " # Fetch only the first 10 rows from the server\n", + " rave_sample = dataconnect_client.fetch_data(\n", + " dataset_uuid=UUID(dataset_ids[\"exciter_rave_id\"]),\n", + " first_n_rows=10,\n", + " )\n", + " display(rave_sample.head())\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7319083fd0e6ca5", + "metadata": {}, + "source": [ + "### Data Transformation Example Using pandas\n", + "\n", + "#### Data transformation using native pandas capability in Python\n", + "\n", + "The fetched data is returned as a pandas `DataFrame`. You can use any pandas operations\n", + "for data manipulation.\n", + "\n", + "If you have limited memory allocation, we recommend working on a limited set of\n", + "records to reduce development time, and then remove the record limit during\n", + "recurring execution.\n", + "\n", + "These functions are not provided by the `dataconnect` library. 
Below are just\n", + "examples of how common Python libraries can be used with the `dataconnect` library.\n", + "For details on how these libraries can be used, please consult the respective\n", + "library documentation.\n", + "\n", + "#### Set up: Using pandas to perform data transformation on fetched DataFrames" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bf1896ca482bfd9", + "metadata": {}, + "outputs": [], + "source": [ + "# Pivot der_data\n", + "pivot_df = der_data.rename(\n", + " columns={\n", + " \"HCT\": \"HCT_result\",\n", + " \"PLAT\": \"plat_result\",\n", + " \"WBC\": \"wbc_result\",\n", + " \"HCT_UNIT\": \"HCT_unit\",\n", + " \"PLAT_UNIT\": \"plat_unit\",\n", + " \"WBC_UNIT\": \"wbc_unit\",\n", + " }\n", + ").melt(\n", + " id_vars=[\"patient_id\", \"site_id\", \"LBTIM\"],\n", + " var_name=\"lab_test_value\",\n", + " value_name=\"value\",\n", + ")\n", + "\n", + "# Split the lab_test_value column into lab_test and measurement type\n", + "pivot_df[[\"lab_test\", \"measure\"]] = pivot_df[\"lab_test_value\"].str.rsplit(\"_\", n=1, expand=True)\n", + "pivot_df = pivot_df.pivot_table(\n", + " index=[\"patient_id\", \"site_id\", \"LBTIM\", \"lab_test\"],\n", + " columns=\"measure\",\n", + " values=\"value\",\n", + " aggfunc=\"first\",\n", + ").reset_index()\n", + "pivot_df = pivot_df.rename(columns={\"result\": \"lab_result\", \"unit\": \"lab_unit\"})\n", + "\n", + "# Union the datasets together (pivot_df, import_data)\n", + "pivot_df_renamed = pivot_df.rename(\n", + " columns={\n", + " \"patient_id\": \"subjid\",\n", + " \"site_id\": \"siteid\",\n", + " \"lab_test\": \"lbtest\",\n", + " \"lab_result\": \"lbresn\",\n", + " \"lab_unit\": \"lbresu\",\n", + " }\n", + ")\n", + "union_df = pd.concat([import_data, pivot_df_renamed], ignore_index=True)\n", + "\n", + "# Create publish_df by outer-joining import_data to the union_df key set\n", + "# (to include keys only present in derived data)\n", + "publish_df = 
import_data.merge(union_df[[\"subjid\", \"siteid\", \"visitnum\"]].drop_duplicates(), on=[\"subjid\", \"siteid\", \"visitnum\"], how=\"outer\")\n", + "\n", + "publish_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "5132a8c3681180aa", + "metadata": {}, + "source": [ + "### Publish datasets with `dataconnect` library\n", + "\n", + "#### Set up the project token and publish parameters\n", + "\n", + "The `project_token` is a Base64-encoded token that identifies the target study, study environment, and project.\n", + "You can retrieve this from the \"Developer info\" in the iMedidata Platform." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe7af53732e8cf31", + "metadata": {}, + "outputs": [], + "source": [ + "project_token = \"projecttoken\"\n", + "\n", + "# Name for the dataset you want to publish\n", + "publish_dataset_name = \"my_published_dataset\"\n", + "\n", + "# Key columns define the unique key for deduplication\n", + "key_columns = [\"subjid\", \"siteid\", \"visitnum\"]\n", + "\n", + "# Source dataset UUIDs that the published dataset is derived from\n", + "source_dataset_uuids = [\n", + " UUID(dataset_ids[\"pwb_import_id\"]),\n", + " UUID(dataset_ids[\"pwb_der_id\"]),\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "665d73cc31b99b17", + "metadata": {}, + "source": [ + "#### Dry publish (validate without persisting)\n", + "\n", + "Before publishing, use `dry_publish` to validate your dataset against the server\n", + "without committing any changes. This lets you catch errors early." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23e2cabf3fead3bb", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " dry_result = dataconnect_client.dry_publish(\n", + " project_token=project_token,\n", + " dataset_name=publish_dataset_name,\n", + " key_columns=key_columns,\n", + " source_datasets=source_dataset_uuids,\n", + " data=publish_df,\n", + " )\n", + "\n", + " print(f\"Dry publish status: {dry_result.status}\")\n", + " print(f\"Schema valid: {dry_result.is_schema_valid}\")\n", + " print(f\"Config valid: {dry_result.is_config_valid}\")\n", + " print(f\"Dataset valid: {dry_result.is_dataset_valid}\")\n", + " print(f\"Dataset name: {dry_result.dataset_name}\")\n", + " print(f\"Dataset version: {dry_result.dataset_version}\")\n", + " print(f\"Number of columns: {dry_result.no_of_columns}\")\n", + " print(f\"Valid record count: {dry_result.valid_record_count}\")\n", + " print(f\"Duplicate record count: {dry_result.duplicate_record_count}\")\n", + " print(f\"Invalid record count: {dry_result.invalid_record_count}\")\n", + "\n", + " if dry_result.errors:\n", + " print(f\"\\nErrors: {dry_result.errors}\")\n", + "\n", + " if dry_result.invalid_datetime_formats:\n", + " print(f\"\\nInvalid datetime formats: {dry_result.invalid_datetime_formats}\")\n", + "\n", + " if dry_result.invalid_records is not None and not dry_result.invalid_records.empty:\n", + " print(\"\\nInvalid records:\")\n", + " print(dry_result.invalid_records.head())\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "599eddc1f46e945b", + "metadata": {}, + "source": [ + "#### Publish (persist the dataset)\n", + "\n", + "Once the dry publish validates successfully, publish the dataset to the server." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8429cd0eb7e74e1c", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " publish_result = dataconnect_client.publish(\n", + " project_token=project_token,\n", + " dataset_name=publish_dataset_name,\n", + " key_columns=key_columns,\n", + " source_datasets=source_dataset_uuids,\n", + " data=publish_df,\n", + " )\n", + "\n", + " print(f\"Publish status: {publish_result.status}\")\n", + " print(f\"Dataset name: {publish_result.dataset_name}\")\n", + " print(f\"Dataset UUID: {publish_result.dataset_uuid}\")\n", + " print(f\"Dataset version: {publish_result.dataset_version}\")\n", + " print(f\"Dataset batch number: {publish_result.dataset_batch_number}\")\n", + " print(f\"Valid record count: {publish_result.valid_record_count}\")\n", + " print(f\"Duplicate record count: {publish_result.duplicate_record_count}\")\n", + " print(f\"Invalid record count: {publish_result.invalid_record_count}\")\n", + "\n", + " if publish_result.invalid_records is not None and not publish_result.invalid_records.empty:\n", + " print(\"\\nInvalid records:\")\n", + " print(publish_result.invalid_records.head())\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "759eea057d0957b9", + "metadata": {}, + "source": [ + "You can also specify `datetime_formats` if your dataset contains datetime columns\n", + "that need a specific format for validation and publishing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb6d988fcf8854e5", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " publish_result_with_formats = dataconnect_client.publish(\n", + " project_token=project_token,\n", + " dataset_name=publish_dataset_name,\n", + " key_columns=key_columns,\n", + " source_datasets=source_dataset_uuids,\n", + " data=publish_df,\n", + " datetime_formats={\"visit_date\": \"yyyy-MM-dd\"},\n", + " )\n", + "\n", + " print(f\"Publish status: {publish_result_with_formats.status}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "608fe2d6aa684bd2", + "metadata": {}, + "source": [ + "### Close the Client Connection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fce61d2f447c8e0", + "metadata": {}, + "outputs": [], + "source": [ + "if dataconnect_client:\n", + " dataconnect_client.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}