Scalable mono repo data engineering workloads #44

Open · wants to merge 2 commits into main
5 changes: 5 additions & 0 deletions knowledge_base/scalable_mono_repo_de/.flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E501, W503, F403
max-line-length = 90
max-complexity = 18
select = B,C,E,F,W,T4,B9
@@ -0,0 +1,93 @@
# NOTE - A lot of this can be configured using GitHub environments & a re-usable
# workflow (a sketch follows this file).
name: "Release workflow for staging environment."

# Ensure that only a single run of this workflow's concurrency group
# is in progress at a time.
concurrency: release-staging

# Trigger this workflow on every push to the repo's staging branch
# (e.g. when a pull request is merged into staging)
on:
  push:
    branches:
      - staging

jobs:
  test:
    name: "Test python packages"
    runs-on: ubuntu-latest

    steps:
      # Check out this repo
      - uses: actions/checkout@v3

      # Use the specified python version
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10.14'

      # Download the Databricks CLI for bundle commands
      - uses: databricks/setup-cli@main

      # Run tests on remote; under the hood the service principal
      # runs the tests defined in this repository on serverless compute
      - run: make ci-test
        working-directory: .
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_ID_STAGING }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_SECRET_STAGING }}
          DATABRICKS_CLUSTER_ID: ${{ secrets.YOUR_DATABRICKS_CLUSTER_ID_STAGING }}
          DATABRICKS_BUNDLE_ENV: staging

  validate:
    name: "Validate bundle"
    runs-on: ubuntu-latest

    # Only run if tests pass
    needs:
      - test

    steps:
      # Check out this repo
      - uses: actions/checkout@v3

      # Download the Databricks CLI for bundle commands
      - uses: databricks/setup-cli@main

      # Validate the bundle configuration before we try to deploy anything
      # Ideally here we would also do something like a "dry-run" when
      # functionality exists
      - run: databricks bundle validate -t staging
        working-directory: .
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_ID_STAGING }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_SECRET_STAGING }}
          DATABRICKS_BUNDLE_ENV: staging

  deploy:
    name: "Deploy bundle"
    runs-on: ubuntu-latest

    # Only run if validate succeeds
    needs:
      - validate

    steps:
      # Check out this repo
      - uses: actions/checkout@v3

      # Download the Databricks CLI for bundle commands
      - uses: databricks/setup-cli@main

      # Deploy the bundle to the "staging" target as defined
      # in the bundle's configuration file
      - run: databricks bundle deploy -t staging --auto-approve
        working-directory: .
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_ID_STAGING }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_SECRET_STAGING }}
          DATABRICKS_BUNDLE_ENV: staging
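As noted at the top of this file, the staging & prod releases could share a single re-usable workflow. A minimal sketch of what that could look like; the file name, input & secret names below are illustrative and not part of this PR:

```
# .github/workflows/release.yml (hypothetical re-usable workflow)
name: "Re-usable release workflow"

on:
  workflow_call:
    inputs:
      target:
        description: "Bundle target to deploy (e.g. staging or prod)"
        required: true
        type: string
    secrets:
      client_id:
        required: true
      client_secret:
        required: true

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: databricks/setup-cli@main
      # Deploy the bundle to whichever target the caller passed in
      - run: databricks bundle deploy -t ${{ inputs.target }} --auto-approve
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.client_id }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.client_secret }}
          DATABRICKS_BUNDLE_ENV: ${{ inputs.target }}
```

Each environment-specific workflow would then call it with `uses: ./.github/workflows/release.yml`, passing its own `target` and secrets.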
10 changes: 10 additions & 0 deletions knowledge_base/scalable_mono_repo_de/.gitignore
@@ -0,0 +1,10 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
.vscode
.vscode/*
15 changes: 15 additions & 0 deletions knowledge_base/scalable_mono_repo_de/.pre-commit-config.yaml
@@ -0,0 +1,15 @@
repos:
  # Formatting
  - repo: https://github.com/psf/black
    rev: 23.1.0
    hooks:
      - id: black
        language_version: python3.10
  # flake8 linting
  - repo: https://github.com/PyCQA/flake8.git
    rev: 4.0.1
    hooks:
      - id: flake8
        additional_dependencies: [
          'flake8-future-annotations==0.0.4',
        ]
1 change: 1 addition & 0 deletions knowledge_base/scalable_mono_repo_de/.python-version
@@ -0,0 +1 @@
3.10.14
64 changes: 64 additions & 0 deletions knowledge_base/scalable_mono_repo_de/Makefile
@@ -0,0 +1,64 @@
# Global vars.
VENV=.venv
PYTHON_VERSION=3.10.14
PYTHON=${VENV}/bin/python

# Define standard colours.
GREEN=\033[0;32m
RED=\033[0;31m
BLUE=\033[0;34m
NORMAL=\033[0m

.PHONY: clean
clean:
### Remove any existing virtual environments & temp files.
	@echo "${RED}Removing existing virtual environments.${NORMAL}"
	rm -rf .python-version
	rm -rf $(VENV)

	@echo "${GREEN}Removing temp files${NORMAL}"
	-rm -rf .cache
	-rm -rf .pytest_cache
	-rm -rf coverage
	-rm -rf .coverage
	-rm -rf build
	-rm -rf */*/build
	-rm -rf dist
	-rm -rf */*/dist
	-rm -rf *.egg-info
	-rm -rf */*/*.egg-info
	-rm -rf *.whl

build-local-virtualenv:
### Install python version locally using pyenv & set it as the local version used
### for development.
	@echo "${GREEN}Installing default python version using pyenv.${NORMAL}"
	pyenv install -s $(PYTHON_VERSION)
	pyenv local $(PYTHON_VERSION)
	@echo "${GREEN}Creating virtual environment.${NORMAL}"
	test -d $(VENV) || $(HOME)/.pyenv/versions/$(PYTHON_VERSION)/bin/python -m venv $(VENV)

	@echo "${GREEN}Building root environment for local testing & databricks connect${NORMAL}"
	. $(VENV)/bin/activate && \
	pip install -r requirements-dev.txt && \
	pre-commit install

.PHONY: setup
### Set up local virtual environment for testing & development.
setup: clean build-local-virtualenv

.PHONY: test
### Run tests on remote.
test:
	@echo "${GREEN}Running tests${NORMAL}"
	$(PYTHON) -m pytest -s tests/ -v -p no:warnings

build-test: setup test

###########################################
###                 CI                  ###
###########################################

ci-test:
### This should probably be cleaned up & improved.
	pip install -r requirements-dev.txt && \
	python3 -m pytest -s tests/ -v
108 changes: 108 additions & 0 deletions knowledge_base/scalable_mono_repo_de/README.md
@@ -0,0 +1,108 @@
# Scalable Databricks Asset Bundles (DABs) mono-repo

This project shows how you can structure your DABs git repositories in a scalable & effective manner, along with some general best practices & CI/CD examples.

**This repo is intended for demonstration purposes only. Neither I nor Databricks is liable for any shortcomings in this project.**

## Prerequisites

1\. Python versions & environments are managed via `pyenv`. You can [install pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) using a package manager such as [homebrew](https://docs.brew.sh/):

```
brew update
brew install pyenv
```

## Getting started

1\. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2a. Authenticate to your Sandbox / Development workspace, if you have not done so already:
```
$ databricks configure
```

2b. Set up your default Databricks profile in `.databrickscfg` so that any validation & deployment requests are made against that workspace:
```
host = <your_workspace_uri>
serverless_compute_id = auto
token = <your_personal_access_token>
```

**Note:** it is advised that you use serverless compute where possible to run your tests, as this provides the shortest feedback loop for development. If you want to use an interactive cluster instead, remove the `serverless_compute_id = auto` line & replace it with `cluster_id = <your_cluster_id>`.
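For example, a profile pointing at an interactive cluster could look like the following (all values are placeholders):

```
host = <your_workspace_uri>
token = <your_personal_access_token>
cluster_id = <your_cluster_id>
```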

3\. Set up your local environment for development purposes by running:
```
make setup
```
This creates a local Python virtual environment & installs all project dependencies. It also installs the `pre-commit` hooks, which are entirely optional.
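If you want to run the installed hooks manually across the whole repo at any point, you can do so with:

```
pre-commit run --all-files
```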

4\. Verify that your environment is correctly configured by running:

```
make test
```

This runs all package tests defined in `./tests/` remotely in your Databricks workspace, on serverless or interactive compute depending on which you have configured. Alternatively, you _could_ run the tests locally by containerising Spark or using a local Spark session, as sketched below.
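A minimal sketch of what that local setup could look like, assuming `pyspark` is installed in the local virtual environment (this fixture is illustrative and not part of this repo):

```
# conftest.py (hypothetical) - provide a local SparkSession so the
# package tests can run without a Databricks workspace.
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    # Local, in-process Spark; swap this for a containerised cluster if preferred.
    session = (
        SparkSession.builder.master("local[2]")
        .appName("local-package-tests")
        .getOrCreate()
    )
    yield session
    session.stop()
```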

5\. To deploy a development copy of this project, type:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `-t` parameter is optional here.)

This deploys everything that's defined for this project.
For example, this would deploy a job called
`[dev yourname] my_serverless_workflow_dev` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.

6\. Verify that the job has been deployed by running it:
```
$ databricks bundle run my_serverless_workflow -t dev
```

You should see something like the following from your IDE:

```
Run URL: https://company.databricks.com/<job_id>/<run_id>

2024-01-01 00:00:00 "[dev <my_user>] my_serverless_workflow_dev" RUNNING
.
.
.
```

You can verify that the job is running by visiting the UI. Once the job has started, you should see the cluster logs in your IDE again:

```
cowsay is installed & has version: 6.1
boto3 is installed & has version: 1.0.0
get_taxis_data is installed & has version: 0.1.0
utils is installed & has version: 0.1.0
+--------------------+---------------------+-------------+-----------+----------+-----------+--------------------+
|tpep_pickup_datetime|tpep_dropoff_datetime|trip_distance|fare_amount|pickup_zip|dropoff_zip|processing_timestamp|
+--------------------+---------------------+-------------+-----------+----------+-----------+--------------------+
| 2016-02-14 16:52:13| 2016-02-14 17:16:04| 4.94| 19.0| 10282| 10171|2024-10-25 15:00:...|
| 2016-02-04 18:44:19| 2016-02-04 18:46:00| 0.28| 3.5| 10110| 10110|2024-10-25 15:00:...|
| 2016-02-17 17:13:57| 2016-02-17 17:17:55| 0.7| 5.0| 10103| 10023|2024-10-25 15:00:...|
| 2016-02-18 10:36:07| 2016-02-18 10:41:45| 0.8| 6.0| 10022| 10017|2024-10-25 15:00:...|
| 2016-02-22 14:14:41| 2016-02-22 14:31:52| 4.51| 17.0| 10110| 10282|2024-10-25 15:00:...|
+--------------------+---------------------+-------------+-----------+----------+-----------+--------------------+
only showing top 5 rows
```

## Intended usage

The intended workflow for this project / demo looks something like the following:

1\. Contributors branch off of the remote staging branch for their new features.

2\. As contributors make their development changes locally on this new branch, they can run their tests either locally or in their remote sandbox / development Databricks workspace, using a compute resource & DBR of their choice (a sketch of this loop follows the list).

3\. Contributors can also deploy their changes to this sandbox / development workspace for integrated testing & to run jobs or workflows if they want to.

4\. Once the contributor is happy with their changes, they push them to the remote feature branch & open a PR against `staging`.

5\. Upon merge into the `staging` branch, the GitHub workflow defined in `.github/workflows` runs the same tests, validation & deployment in a controlled environment, using a service principal. This deploys all changes to the staging workspace.

6\. Once the deployment has succeeded & further testing in staging has been done, the same process is carried out to deploy into production (this still needs to be done).
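A minimal sketch of the local loop described in steps 1-4 above (the branch name is illustrative):

```
# Branch off staging for a new feature
git checkout staging && git pull
git checkout -b feature/my-change

# Run the package tests (remotely, as configured in the Makefile)
make test

# Deploy a personal copy to the dev target for integrated testing
databricks bundle deploy -t dev

# Push the branch & open a PR against staging
git push -u origin feature/my-change
```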
79 changes: 79 additions & 0 deletions knowledge_base/scalable_mono_repo_de/databricks.yml
@@ -0,0 +1,79 @@
# This is a Databricks asset bundle definition for my_project.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:

[Review comment] Adding databricks_cli_version can help mitigate issues related to the CLI version. Since new features are frequently added, this can be helpful.

[Review comment] Ref: #19

  name: dabs_bootstrap

include:
  - resources/jobs/*.yml
  - resources/pipelines/*.yml

# Define re-usable variables
variables:
  env:
    description: Environment value for job name injection.
    default: dev
  default_serverless_env_spec:
    description: Default serverless environment configuration (example).
    type: complex
    default:
      client: "1"
      dependencies:
        - -r "/Workspace${workspace.file_path}/environments/default-requirements.txt"
        - ../../src/packages/get_taxis_data/dist/*.whl
        - ../../src/packages/utils/dist/*.whl

# Build artifacts using poetry, in this case we only have two
artifacts:
  utils_package:
    type: whl
    build: poetry build
    path: src/packages/utils/
  get_taxis_data_package:
    type: whl
    build: poetry build
    path: src/packages/get_taxis_data/

targets:
  # The 'dev' target, for development purposes. This target is the default
  dev:
    # We use 'mode: development' to indicate this is a personal development copy:
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default
    # - The 'development' mode is used for Delta Live Tables pipelines
    mode: development
    default: true
    workspace:
      host: https://company.databricks.com

  # The 'staging' target, used for UAT deployment - we mimic production here
  staging:
    # We use 'mode: production' to indicate this is a production deployment
    # Doing so enables strict verification of the settings below
    mode: production
    workspace:
      host: https://company.databricks.com
      # We always use /Shared/.bundle/${bundle.name} for all resources to make sure we only have a single copy
      root_path: /Shared/.bundle/${bundle.name}
    run_as:
      # This runs as your service principal in staging
      service_principal_name: {{ your_service_principal_id }}
    # We can use a default env variable to dynamically inject "staging" into our resource names
    variables:
      env: staging

  # The 'prod' target, used for production deployment
  prod:
    # We use 'mode: production' to indicate this is a production deployment
    # Doing so enables strict verification of the settings below
    mode: production
    workspace:
      host: https://company.databricks.com
      # We always use /Shared/.bundle/${bundle.name} for all resources to make sure we only have a single copy
      root_path: /Shared/.bundle/${bundle.name}
    run_as:
      # This runs as your service principal in production
      service_principal_name: {{ your_service_principal_id }}
    # We can use a default env variable to dynamically inject "prod" into our resource names
    variables:
      env: prod
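The `include` section above pulls in `resources/jobs/*.yml`, which is where the `env` variable and the serverless environment spec would be consumed. A minimal sketch of such a job definition for the `my_serverless_workflow` job mentioned in the README; the task key & entry point are illustrative and not part of this PR:

```
# resources/jobs/my_serverless_workflow.yml (hypothetical)
resources:
  jobs:
    my_serverless_workflow:
      name: my_serverless_workflow_${var.env}
      tasks:
        - task_key: get_taxis_data
          python_wheel_task:
            package_name: get_taxis_data
            entry_point: main  # illustrative entry point
          environment_key: default
      environments:
        - environment_key: default
          spec: ${var.default_serverless_env_spec}
```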
@@ -0,0 +1,2 @@
cowsay==6.1
boto3==1.0
2 changes: 2 additions & 0 deletions knowledge_base/scalable_mono_repo_de/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
testpaths = tests