diff --git a/json-schema/exampleWorkflow.json b/json-schema/exampleWorkflow.json new file mode 100644 index 00000000..ba01bcdf --- /dev/null +++ b/json-schema/exampleWorkflow.json @@ -0,0 +1,121 @@ +{ + "schemaVersion": "0.0.1", + "name": "Example workflow", + "parameters": { + "ENV": "production", + "BUCKET": "mybucket" + }, + "schedule": "0 6 * * *", + "timezone": "UTC", + "tasks": [ + { + "node_id": "T1", + "name": "Initial Data Prep", + "input_uri": "/data/input1.csv", + "depends_on": [], + "runtime_environment_name": "python-env", + "runtime_environment_parameters": { + "python_version": "3.9" + }, + "output_formats": [ + "html", + "pdf" + ], + "parameters": { + "cleanup": "true" + }, + "tags": [ + "data-prep" + ], + "compute_type": "small", + "package_input_folder": false + }, + { + "node_id": "T2", + "name": "Combine Data", + "input_uri": "/data/combined_input.csv", + "depends_on": [ + { + "node_id": "T1" + }, + { + "node_id": "T3" + } + ], + "runtime_environment_name": "python-env", + "runtime_environment_parameters": { + "python_version": "3.9" + }, + "output_formats": [ + "notebook" + ], + "parameters": { + "merge_mode": "full-outer" + }, + "tags": [ + "combine", + "processing" + ], + "compute_type": "medium", + "package_input_folder": false + }, + { + "node_id": "T3", + "name": "Preprocess Data", + "input_uri": "/data/raw_data.csv", + "depends_on": [], + "runtime_environment_name": "python-env", + "runtime_environment_parameters": {}, + "output_formats": [], + "parameters": { + "normalize": "true" + }, + "tags": [ + "preprocess" + ], + "compute_type": "large", + "package_input_folder": true + }, + { + "node_id": "T4", + "name": "Analysis", + "input_uri": "/data/analysis_input.csv", + "depends_on": [ + { + "node_id": "T2" + } + ], + "runtime_environment_name": "r-env", + "runtime_environment_parameters": { + "r_version": "4.2" + }, + "output_formats": [ + "html" + ], + "parameters": { + "analysis_type": "timeseries" + }, + "tags": [ + "analysis", + "R" + 
], + "compute_type": "medium", + "package_input_folder": false + }, + { + "node_id": "T5", + "name": "Final Output", + "input_uri": "/data/final_result.csv", + "depends_on": [], + "runtime_environment_name": "python-env", + "runtime_environment_parameters": {}, + "output_formats": [], + "parameters": {}, + "tags": [ + "final" + ], + "compute_type": "small", + "package_input_folder": false + } + ] +} diff --git a/json-schema/workflow.schema.json b/json-schema/workflow.schema.json new file mode 100644 index 00000000..5fd6d33f --- /dev/null +++ b/json-schema/workflow.schema.json @@ -0,0 +1,121 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "version": "0.0.1", + "title": "Workflow", + "type": "object", + "properties": { + "schemaVersion": { + "type": "string", + "default": "0.0.1" + }, + "tasks": { + "type": "array", + "items": { + "$ref": "#/definitions/Task" + }, + "description": "Workflow tasks." + }, + "name": { + "type": "string", + "description": "The name of the workflow." + }, + "parameters": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Optional parameters for the workflow." + }, + "schedule": { + "type": "string", + "description": "Optional schedule in cron format." + }, + "timezone": { + "type": "string", + "description": "Timezone for the schedule." + } + }, + "required": [ + "tasks", + "name" + ], + "definitions": { + "Task": { + "type": "object", + "properties": { + "input_uri": { + "type": "string", + "description": "The URI of the input file." + }, + "runtime_environment_name": { + "type": "string", + "description": "Name of the runtime environment." + }, + "runtime_environment_parameters": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Parameters for the runtime environment." 
+ }, + "output_formats": { + "type": "array", + "items": { + "type": "string" + } + }, + "parameters": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Task-specific parameters." + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags for categorizing the job." + }, + "name": { + "type": "string", + "description": "Name of the job." + }, + "compute_type": { + "type": "string", + "description": "Type of compute resource to use." + }, + "package_input_folder": { + "type": "boolean", + "description": "Whether to package the input folder." + }, + "depends_on": { + "type": "array", + "description": "DAG node IDs of tasks this task depends on (upstream dependencies).", + "items": { + "type": "object", + "properties": { + "node_id": { + "type": "string" + } + }, + "required": [ + "node_id" + ] + } + }, + "node_id": { + "type": "string", + "description": "DAG node ID of this task." + } + }, + "required": [ + "input_uri", + "name", + "node_id", + "depends_on" + ] + } + } +}