Scalable mono repo data engineering workloads #44

Open · wants to merge 2 commits into main
5 changes: 5 additions & 0 deletions knowledge_base/scalable_mono_repo_de/.flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E501, W503, F403
max-line-length = 90
max-complexity = 18
select = B,C,E,F,W,T4,B9
@@ -0,0 +1,93 @@
# NOTE - A lot of this can be configured using GitHub environments & a re-usable
# workflow (a sketch follows this file).
name: "Release workflow for staging environment."

# Ensure that only a single run of this workflow's concurrency group
# is in progress at a time.
concurrency: release-staging

# Trigger this workflow on every push to the repo's staging branch
# (e.g. when a pull request is merged into staging)
on:
  push:
    branches:
      - staging

jobs:
  test:
    name: "Test python packages"
    runs-on: ubuntu-latest

    steps:
      # Check out this repo
      - uses: actions/checkout@v3

      # Use the specified python version
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10.14'

      # Download the Databricks CLI for bundle commands
      - uses: databricks/setup-cli@main

      # Run tests on remote; under the hood the service principal
      # runs the tests defined in this repository on serverless compute
      - run: make ci-test
        working-directory: .
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_ID_STAGING }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_SECRET_STAGING }}
          DATABRICKS_CLUSTER_ID: ${{ secrets.YOUR_DATABRICKS_CLUSTER_ID_STAGING }}
          DATABRICKS_BUNDLE_ENV: staging

  validate:
    name: "Validate bundle"
    runs-on: ubuntu-latest

    # Only run if tests pass
    needs:
      - test

    steps:
      # Check out this repo
      - uses: actions/checkout@v3

      # Download the Databricks CLI for bundle commands
      - uses: databricks/setup-cli@main

      # Validate the bundle configuration before we try to deploy anything
      # Ideally here we would also do something like a "dry-run" when
      # functionality exists
      - run: databricks bundle validate -t staging
        working-directory: .
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_ID_STAGING }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_SECRET_STAGING }}
          DATABRICKS_BUNDLE_ENV: staging

  deploy:
    name: "Deploy bundle"
    runs-on: ubuntu-latest

    # Only run if validate succeeds
    needs:
      - validate

    steps:
      # Check out this repo
      - uses: actions/checkout@v3

      # Download the Databricks CLI for bundle commands
      - uses: databricks/setup-cli@main

      # Deploy the bundle to the "staging" target as defined
      # in the bundle's configuration file
      - run: databricks bundle deploy -t staging --auto-approve
        working-directory: .
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_ID_STAGING }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.YOUR_SERVICE_PRINCIPAL_CLIENT_SECRET_STAGING }}
          DATABRICKS_BUNDLE_ENV: staging
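As noted at the top of this file, the staging & prod releases could share a single re-usable workflow. A minimal sketch of what that could look like; the file name, input & secret names below are illustrative and not part of this PR:

```
# .github/workflows/release.yml (hypothetical re-usable workflow)
name: "Re-usable release workflow"

on:
  workflow_call:
    inputs:
      target:
        description: "Bundle target to deploy (e.g. staging or prod)"
        required: true
        type: string
    secrets:
      client_id:
        required: true
      client_secret:
        required: true

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: databricks/setup-cli@main
      # Deploy the bundle to whichever target the caller passed in
      - run: databricks bundle deploy -t ${{ inputs.target }} --auto-approve
        env:
          DATABRICKS_HOST: https://company.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.client_id }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.client_secret }}
          DATABRICKS_BUNDLE_ENV: ${{ inputs.target }}
```

Each environment-specific workflow would then call it with `uses: ./.github/workflows/release.yml`, passing its own `target` and secrets.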
10 changes: 10 additions & 0 deletions knowledge_base/scalable_mono_repo_de/.gitignore
@@ -0,0 +1,10 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
.vscode
.vscode/*
15 changes: 15 additions & 0 deletions knowledge_base/scalable_mono_repo_de/.pre-commit-config.yaml
@@ -0,0 +1,15 @@
repos:
  # Formatting
  - repo: https://github.com/psf/black
    rev: 23.1.0
    hooks:
      - id: black
        language_version: python3.10
  # flake8 linting
  - repo: https://github.com/PyCQA/flake8.git
    rev: 4.0.1
    hooks:
      - id: flake8
        additional_dependencies: [
          'flake8-future-annotations==0.0.4',
        ]
1 change: 1 addition & 0 deletions knowledge_base/scalable_mono_repo_de/.python-version
@@ -0,0 +1 @@
3.10.14
64 changes: 64 additions & 0 deletions knowledge_base/scalable_mono_repo_de/Makefile
@@ -0,0 +1,64 @@
# Global vars.
VENV=.venv
PYTHON_VERSION=3.10.14
PYTHON=${VENV}/bin/python

# Define standard colours.
GREEN=\033[0;32m
RED=\033[0;31m
BLUE=\033[0;34m
NORMAL=\033[0m

.PHONY: clean
clean:
### Remove any existing virtual environments & temp files.
	@echo "${RED}Removing existing virtual environments.${NORMAL}"
	rm -rf .python-version
	rm -rf $(VENV)

	@echo "${GREEN}Removing temp files${NORMAL}"
	-rm -rf .cache
	-rm -rf .pytest_cache
	-rm -rf coverage
	-rm -rf .coverage
	-rm -rf build
	-rm -rf */*/build
	-rm -rf dist
	-rm -rf */*/dist
	-rm -rf *.egg-info
	-rm -rf */*/*.egg-info
	-rm -rf *.whl

build-local-virtualenv:
### Install python version locally using pyenv & set it as the local version used
### for development.
	@echo "${GREEN}Installing default python version using pyenv.${NORMAL}"
	pyenv install -s $(PYTHON_VERSION)
	pyenv local $(PYTHON_VERSION)
	@echo "${GREEN}Creating virtual environment.${NORMAL}"
	test -d $(VENV) || $(HOME)/.pyenv/versions/$(PYTHON_VERSION)/bin/python -m venv $(VENV)

	@echo "${GREEN}Building root environment for local testing & databricks connect${NORMAL}"
	. $(VENV)/bin/activate && \
	pip install -r requirements-dev.txt && \
	pre-commit install

.PHONY: setup
### Set up local virtual environment for testing & development.
setup: clean build-local-virtualenv

.PHONY: test
### Run tests on remote.
test:
	@echo "${GREEN}Running tests${NORMAL}"
	$(PYTHON) -m pytest -s tests/ -v -p no:warnings

build-test: setup test

###########################################
###                 CI                  ###
###########################################

ci-test:
### This should probably be cleaned up & improved.
	pip install -r requirements-dev.txt && \
	python3 -m pytest -s tests/ -v
108 changes: 108 additions & 0 deletions knowledge_base/scalable_mono_repo_de/README.md
@@ -0,0 +1,108 @@
# Scalable Databricks Asset Bundles (DABs) mono-repo

This project shows how you can structure your DABs git repositories in a scalable & effective manner, along with some general best practices & CI/CD examples.

**This repo is intended for demonstration purposes only. Neither I nor Databricks is liable for any shortcomings in this project.**

## Prerequisites

1\. Python versions & environments are managed via `pyenv`. You can [install pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) using a package manager such as [homebrew](https://docs.brew.sh/):

```
brew update
brew install pyenv
```

## Getting started

1\. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2a. Authenticate to your Sandbox / Development workspace, if you have not done so already:
```
$ databricks configure
```

2b. Set up your default Databricks profile in `.databrickscfg` so that any validation & deployment requests are made against that workspace:
```
host = <your_workspace_uri>
serverless_compute_id = auto
token = <your_personal_access_token>
```

**Note:** it is advised that you use serverless compute where possible to run your tests, as this provides the shortest feedback loop for development. If you want to use an interactive cluster instead, remove the `serverless_compute_id = auto` line & replace it with `cluster_id = <your_cluster_id>`.
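For example, a profile pointing at an interactive cluster could look like the following (all values are placeholders):

```
host = <your_workspace_uri>
token = <your_personal_access_token>
cluster_id = <your_cluster_id>
```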

3\. Set up your local environment for development purposes by running:
```
make setup
```
This creates a local Python virtual environment & installs all project dependencies. It also installs the `pre-commit` hooks, which are entirely optional.
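If you want to run the installed hooks manually across the whole repo at any point, you can do so with:

```
pre-commit run --all-files
```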

4\. Verify that your environment is correctly configured by running:

```
make test
```

This runs all package tests defined in `./tests/` remotely in your Databricks workspace, on serverless or interactive compute depending on which you have configured. Alternatively, you _could_ run the tests locally by containerising Spark or using a local Spark session, as sketched below.
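A minimal sketch of what that local setup could look like, assuming `pyspark` is installed in the local virtual environment (this fixture is illustrative and not part of this repo):

```
# conftest.py (hypothetical) - provide a local SparkSession so the
# package tests can run without a Databricks workspace.
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    # Local, in-process Spark; swap this for a containerised cluster if preferred.
    session = (
        SparkSession.builder.master("local[2]")
        .appName("local-package-tests")
        .getOrCreate()
    )
    yield session
    session.stop()
```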

5\. To deploy a development copy of this project, type:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `-t` parameter is optional here.)

This deploys everything that's defined for this project.
For example, this would deploy a job called
`[dev yourname] my_serverless_workflow_dev` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.

6\. Verify that the job has been deployed by running it:
```
$ databricks bundle run my_serverless_workflow -t dev
```

You should see something like the following from your IDE:

```
Run URL: https://company.databricks.com/<job_id>/<run_id>

2024-01-01 00:00:00 "[dev <my_user>] my_serverless_workflow_dev" RUNNING
.
.
.
```

You can verify that the job is running by visiting the UI. Once the job has started, you should see the cluster logs in your IDE again:

```
cowsay is installed & has version: 6.1
boto3 is installed & has version: 1.0.0
get_taxis_data is installed & has version: 0.1.0
utils is installed & has version: 0.1.0
+--------------------+---------------------+-------------+-----------+----------+-----------+--------------------+
|tpep_pickup_datetime|tpep_dropoff_datetime|trip_distance|fare_amount|pickup_zip|dropoff_zip|processing_timestamp|
+--------------------+---------------------+-------------+-----------+----------+-----------+--------------------+
| 2016-02-14 16:52:13| 2016-02-14 17:16:04| 4.94| 19.0| 10282| 10171|2024-10-25 15:00:...|
| 2016-02-04 18:44:19| 2016-02-04 18:46:00| 0.28| 3.5| 10110| 10110|2024-10-25 15:00:...|
| 2016-02-17 17:13:57| 2016-02-17 17:17:55| 0.7| 5.0| 10103| 10023|2024-10-25 15:00:...|
| 2016-02-18 10:36:07| 2016-02-18 10:41:45| 0.8| 6.0| 10022| 10017|2024-10-25 15:00:...|
| 2016-02-22 14:14:41| 2016-02-22 14:31:52| 4.51| 17.0| 10110| 10282|2024-10-25 15:00:...|
+--------------------+---------------------+-------------+-----------+----------+-----------+--------------------+
only showing top 5 rows
```

## Intended usage

The intended workflow for this project / demo looks something like the following:

1\. Contributors branch off of the remote staging branch for their new features.

2\. As contributors make their development changes locally on this new branch, they can run their tests either locally or in their remote sandbox / development Databricks workspace, using a compute resource & DBR of their choice (a sketch of this loop follows the list).

3\. Contributors can also deploy their changes to this sandbox / development workspace for integrated testing & to run jobs or workflows if they want to.

4\. Once the contributor is happy with their changes, they push them to the remote feature branch & open a PR against `staging`.

5\. Upon merge into the `staging` branch, the GitHub workflow defined in `.github/workflows` runs the same tests, validation & deployment in a controlled environment, using a service principal. This deploys all changes to the staging workspace.

6\. Once the deployment has succeeded & further testing in staging has been done, the same process is carried out to deploy into production (this still needs to be done).
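A minimal sketch of the local loop described in steps 1-4 above (the branch name is illustrative):

```
# Branch off staging for a new feature
git checkout staging && git pull
git checkout -b feature/my-change

# Run the package tests (remotely, as configured in the Makefile)
make test

# Deploy a personal copy to the dev target for integrated testing
databricks bundle deploy -t dev

# Push the branch & open a PR against staging
git push -u origin feature/my-change
```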
79 changes: 79 additions & 0 deletions knowledge_base/scalable_mono_repo_de/databricks.yml
@@ -0,0 +1,79 @@
# This is a Databricks asset bundle definition for my_project.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:

[Review comment] Adding databricks_cli_version can help mitigate issues related to the CLI version. Since new features are frequently added, this can be helpful.

[Review comment] Ref: #19

  name: dabs_bootstrap

include:
  - resources/jobs/*.yml
  - resources/pipelines/*.yml

# Define re-usable variables
variables:
  env:
    description: Environment value for job name injection.
    default: dev
  default_serverless_env_spec:
    description: Default serverless environment configuration (example).
    type: complex
    default:
      client: "1"
      dependencies:
        - -r "/Workspace${workspace.file_path}/environments/default-requirements.txt"
        - ../../src/packages/get_taxis_data/dist/*.whl
        - ../../src/packages/utils/dist/*.whl

# Build artifacts using poetry, in this case we only have two
artifacts:
  utils_package:
    type: whl
    build: poetry build
    path: src/packages/utils/
  get_taxis_data_package:
    type: whl
    build: poetry build
    path: src/packages/get_taxis_data/

targets:
  # The 'dev' target, for development purposes. This target is the default
  dev:
    # We use 'mode: development' to indicate this is a personal development copy:
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default
    # - The 'development' mode is used for Delta Live Tables pipelines
    mode: development
    default: true
    workspace:
      host: https://company.databricks.com

  # The 'staging' target, used for UAT deployment - we mimic production here
  staging:
    # We use 'mode: production' to indicate this is a production deployment
    # Doing so enables strict verification of the settings below
    mode: production
    workspace:
      host: https://company.databricks.com
      # We always use /Shared/.bundle/${bundle.name} for all resources to make sure we only have a single copy
      root_path: /Shared/.bundle/${bundle.name}
    run_as:
      # This runs as your service principal in staging
      service_principal_name: {{ your_service_principal_id }}
    # We can use a default env variable to dynamically inject "staging" into our resource names
    variables:
      env: staging

  # The 'prod' target, used for production deployment
  prod:
    # We use 'mode: production' to indicate this is a production deployment
    # Doing so enables strict verification of the settings below
    mode: production
    workspace:
      host: https://company.databricks.com
      # We always use /Shared/.bundle/${bundle.name} for all resources to make sure we only have a single copy
      root_path: /Shared/.bundle/${bundle.name}
    run_as:
      # This runs as your service principal in production
      service_principal_name: {{ your_service_principal_id }}
    # We can use a default env variable to dynamically inject "prod" into our resource names
    variables:
      env: prod
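The `include` section above pulls in `resources/jobs/*.yml`, which is where the `env` variable and the serverless environment spec would be consumed. A minimal sketch of such a job definition for the `my_serverless_workflow` job mentioned in the README; the task key & entry point are illustrative and not part of this PR:

```
# resources/jobs/my_serverless_workflow.yml (hypothetical)
resources:
  jobs:
    my_serverless_workflow:
      name: my_serverless_workflow_${var.env}
      tasks:
        - task_key: get_taxis_data
          python_wheel_task:
            package_name: get_taxis_data
            entry_point: main  # illustrative entry point
          environment_key: default
      environments:
        - environment_key: default
          spec: ${var.default_serverless_env_spec}
```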
@@ -0,0 +1,2 @@
cowsay==6.1
boto3==1.0
2 changes: 2 additions & 0 deletions knowledge_base/scalable_mono_repo_de/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
testpaths = tests