Skip to content

Commit b68de1f

Browse files
authored
Merge pull request #1 from coinlist/coinlist-snowflake-binary
works for binary uuids
2 parents c1dde75 + bd4c06a commit b68de1f

13 files changed

+525
-27
lines changed

.github/workflows/CODEOWNERS

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
* @coinlist/data
+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
name: cache_dependencies
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
python_version:
7+
required: true
8+
type: string
9+
poetry_version:
10+
required: true
11+
type: string
12+
outputs:
13+
python_cache_key:
14+
description: "The key of the primary cache of the python dependencies"
15+
value: ${{ jobs.python-cache.outputs.key }}
16+
17+
18+
jobs:
19+
python-cache:
20+
runs-on: ubuntu-latest
21+
outputs:
22+
key: ${{ steps.define-cache-key.outputs.cache_key }}
23+
steps:
24+
- uses: actions/checkout@v3
25+
26+
- name: Setup Python
27+
uses: actions/setup-python@v4
28+
id: setup-python
29+
with:
30+
python-version: '${{ inputs.python_version }}'
31+
32+
- name: Install and configure Poetry
33+
uses: snok/install-poetry@v1
34+
with:
35+
version: ${{ inputs.poetry_version }}
36+
virtualenvs-in-project: true
37+
38+
- name: Define Cache Key
39+
id: define-cache-key
40+
run: |
41+
echo "cache_key=python-${{ runner.os }}--${{ inputs.python_version }}-${{ inputs.poetry_version }}-${{ hashFiles('**/poetry.lock') }}" >> $GITHUB_OUTPUT
42+
43+
- name: Cache venv
44+
id: cached-python
45+
uses: actions/cache@v3
46+
with:
47+
path: .venv
48+
key: ${{ steps.define-cache-key.outputs.cache_key }}
49+
50+
- name: Install dependencies
51+
run: poetry install --no-interaction --no-root
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
name: publish_data-platform-data-diff
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
python_version:
7+
required: true
8+
type: string
9+
poetry_version:
10+
required: true
11+
type: string
12+
python_cache_key:
13+
required: true
14+
type: string
15+
16+
permissions:
17+
id-token: write
18+
contents: read
19+
20+
jobs:
21+
build:
22+
name: 'Publish python data-platform-data-diff'
23+
runs-on: ubuntu-latest
24+
25+
steps:
26+
- name: Setup AWS credentials
27+
uses: aws-actions/configure-aws-credentials@v1
28+
with:
29+
role-to-assume: ${{ secrets.CROSS_ACCOUNT_ROLE_TO_ASSUME }}
30+
aws-region: ${{ secrets.AWS_REGION }}
31+
mask-aws-account-id: 'yes'
32+
33+
- uses: actions/checkout@v3
34+
35+
- name: Setup Python
36+
id: setup-python
37+
uses: actions/setup-python@v4
38+
with:
39+
python-version: '${{ inputs.python_version }}'
40+
41+
- name: Install and configure Poetry
42+
uses: snok/install-poetry@v1
43+
with:
44+
version: ${{ inputs.poetry_version }}
45+
virtualenvs-in-project: true
46+
47+
- name: Restore cached key
48+
id: cache-restore
49+
uses: actions/cache/restore@v3
50+
with:
51+
path: .venv
52+
key: ${{ inputs.python_cache_key }}
53+
54+
- name: Install jq
55+
run: sudo apt-get update && sudo apt-get install -y jq
56+
57+
- name: Set env variables
58+
env:
59+
AWS_REGION: ${{ secrets.AWS_REGION }}
60+
CODEARTIFACT_URL: ${{ secrets.CODEARTIFACT_URL }}
61+
run: |
62+
# Replace placeholder URL with actual repository URL
63+
sed -i "s|PLACEHOLDER_URL|$CODEARTIFACT_URL|" pyproject.toml
64+
65+
VERSION=$(poetry run toml get --toml-path pyproject.toml tool.poetry.version 2>/dev/null) || { echo "FAILED TO GET POETRY VERSION"; exit 1; }
66+
echo $VERSION > version.txt
67+
echo "CURRENT_VERSION=$(cat version.txt)" >> $GITHUB_ENV
68+
69+
- name: Check if version needs to be published
70+
if: ${{ github.ref_name == 'master' }}
71+
env:
72+
AWS_REGION: ${{ secrets.AWS_REGION }}
73+
id: check_version
74+
run: |
75+
# Match the exact version string: a substring grep on the JSON output would treat 1.2.3 as already published when only 1.2.30 (or 11.2.3) exists.
if ! aws codeartifact list-package-versions --region $AWS_REGION --domain coinlist --repository data-platform-data-diff --format pypi --package data_diff --query 'versions[].version' --output text 2>/dev/null | tr '\t' '\n' | grep -qxF "$(cat version.txt)"; then
76+
echo "skip_publish=false" >> $GITHUB_ENV
77+
else
78+
echo "skip_publish=true" >> $GITHUB_ENV
79+
fi
80+
81+
- name: Publish dev version
82+
if: ${{ github.ref_name != 'master' }}
83+
run: |
84+
DEV_VERSION="$CURRENT_VERSION-dev+${GITHUB_SHA:0:7}"
85+
echo $DEV_VERSION > version.txt
86+
poetry run toml set --toml-path pyproject.toml tool.poetry.version $DEV_VERSION || { echo "Failed to set dev version in pyproject.toml"; exit 1; }
87+
poetry config repositories.data-platform-data-diff ${{ secrets.CODEARTIFACT_URL }}
88+
poetry build --format wheel || { echo "Failed to build the wheel"; exit 1; }
89+
poetry publish --repository data-platform-data-diff --username aws --password $(aws codeartifact --region ${{ secrets.AWS_REGION }} get-authorization-token --domain coinlist --query authorizationToken --output text 2>/dev/null) || { echo "Failed to publish the dev package"; exit 1; }
90+
91+
- name: Publish new version
92+
# Both conditions must live in ONE expression: any literal text between two expressions turns the value into a non-empty string, which is always truthy.
if: ${{ github.ref_name == 'master' && env.skip_publish != 'true' }}
93+
run: |
94+
poetry build --format wheel 2>/dev/null || { echo "Failed to build the wheel"; exit 1; }
95+
poetry publish --repository data-platform-data-diff --username aws --password $(aws codeartifact --region ${{ secrets.AWS_REGION }} get-authorization-token --domain coinlist --query authorizationToken --output text 2>/dev/null) || { echo "Failed to publish the package"; exit 1; }

.github/workflows/ci.yml

+2-9
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,13 @@
11
name: CI-COVER-VERSIONS
22

33
on:
4-
# push:
5-
# paths:
6-
# - '**.py'
7-
# - '.github/workflows/**'
8-
# - '!dev/**'
9-
pull_request:
10-
branches: [ master ]
11-
124
workflow_dispatch:
5+
workflow_call:
136

147
jobs:
158
unit_tests:
169
strategy:
17-
fail-fast: false
10+
fail-fast: true
1811
matrix:
1912
os: [ubuntu-latest]
2013
python-version:

.github/workflows/ci_full.yml

+2-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,8 @@
11
name: CI-COVER-DATABASES
22

33
on:
4-
# push:
5-
# paths:
6-
# - '**.py'
7-
# - '.github/workflows/**'
8-
# - '!dev/**'
9-
pull_request:
10-
branches: [ master ]
114
workflow_dispatch:
5+
workflow_call:
126

137
permissions:
148
id-token: write # This is required for requesting the JWT
@@ -17,7 +11,7 @@ permissions:
1711
jobs:
1812
unit_tests:
1913
strategy:
20-
fail-fast: false
14+
fail-fast: true
2115
matrix:
2216
os: [ubuntu-latest]
2317
python-version:

.github/workflows/formatter.yml

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
name: formatter
22
on:
3-
pull_request:
4-
branches: [ master ]
5-
6-
workflow_dispatch:
3+
workflow_dispatch:
4+
workflow_call:
75

86
jobs:
97
linter_name:
@@ -19,4 +17,4 @@ jobs:
1917
- name: Auto commit ruff formatting
2018
uses: stefanzweifel/git-auto-commit-action@v5
2119
with:
22-
commit_message: 'style fixes by ruff'
20+
commit_message: 'style fixes by ruff'
+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
name: PR checks
2+
3+
on:
4+
pull_request: {}
5+
6+
permissions:
7+
id-token: write
8+
contents: read
9+
actions: write
10+
11+
jobs:
12+
cancel:
13+
runs-on: ubuntu-latest
14+
steps:
15+
- uses: styfle/[email protected]
16+
with:
17+
access_token: ${{ github.token }}
18+
19+
setup:
20+
runs-on: ubuntu-latest
21+
needs: [cancel]
22+
outputs:
23+
python_version: ${{ steps.set_var.outputs.python_version }}
24+
poetry_version: ${{ steps.set_var.outputs.poetry_version }}
25+
steps:
26+
- id: set_var
27+
run: |
28+
echo "python_version=3.8" >> $GITHUB_OUTPUT
29+
echo "poetry_version=1.7.1" >> $GITHUB_OUTPUT
30+
31+
perform-ruff-formatting:
32+
needs: [setup]
33+
uses: ./.github/workflows/formatter.yml
34+
35+
cache-dependencies:
36+
needs: [setup, perform-ruff-formatting]
37+
uses: ./.github/workflows/_cache-dependencies.yml
38+
secrets: inherit
39+
with:
40+
python_version: ${{ needs.setup.outputs.python_version }}
41+
poetry_version: ${{ needs.setup.outputs.poetry_version }}
42+
43+
run-unit-test-versions:
44+
needs: [setup]
45+
uses: ./.github/workflows/ci_full.yml
46+
47+
run-unit-test-per-database:
48+
needs: [setup]
49+
uses: ./.github/workflows/ci.yml
50+
51+
publish-data-platform-data-diff:
52+
needs: [setup, run-unit-test-versions, run-unit-test-per-database, cache-dependencies]
53+
uses: ./.github/workflows/_publish-data-platform-data-diff.yml
54+
secrets: inherit
55+
with:
56+
python_version: ${{ needs.setup.outputs.python_version }}
57+
poetry_version: ${{ needs.setup.outputs.poetry_version }}
58+
python_cache_key: ${{ needs.cache-dependencies.outputs.python_cache_key }}

data_diff/abcs/database_types.py

+6
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,12 @@ class String_UUID(ColType_UUID, StringType):
134134
pass
135135

136136

137+
# Snowflake Binary UUID
138+
@attrs.define(frozen=True)
139+
class Binary_UUID(ColType_UUID):
140+
python_type = bytes
141+
142+
137143
@attrs.define(frozen=True)
138144
class String_Alphanum(ColType_Alphanum, StringType):
139145
@staticmethod

data_diff/databases/_connect.py

+5
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from data_diff.databases.duckdb import DuckDB
2626
from data_diff.databases.mssql import MsSQL
2727

28+
from urllib.parse import unquote
29+
2830

2931
@attrs.define(frozen=True)
3032
class MatchUriPath:
@@ -196,6 +198,9 @@ def connect_to_uri(self, db_uri: str, thread_count: Optional[int] = 1, **kwargs)
196198
if dsn.password:
197199
kw["password"] = dsn.password
198200

201+
# snowflake connector can handle unquoted values, but data-diff cannot
202+
# results in error if user or password is encoded
203+
# https://github.com/datafold/data-diff/issues/428
199204
kw = {k: v for k, v in kw.items() if v is not None}
200205

201206
if issubclass(cls, ThreadedDatabase):

data_diff/databases/base.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from data_diff.queries.api import Expr, table, Select, SKIP, Explain, Code, this
2424
from data_diff.queries.ast_classes import (
2525
Alias,
26+
BinBoolOp,
2627
BinOp,
2728
CaseWhen,
2829
Cast,
@@ -64,6 +65,7 @@
6465
Float,
6566
Native_UUID,
6667
String_UUID,
68+
Binary_UUID,
6769
String_Alphanum,
6870
String_VaryingAlphanum,
6971
TemporalType,
@@ -482,6 +484,22 @@ def render_tableop(self, parent_c: Compiler, elem: TableOp) -> str:
482484
def render__resolvecolumn(self, c: Compiler, elem: _ResolveColumn) -> str:
483485
return self.compile(c, elem._get_resolved())
484486

487+
def modify_string_where_clause(self, col, where_clause):
488+
# NOTE: snowflake specific issue with Binary columns
489+
return where_clause.replace(f'"{col}"', f"TO_VARCHAR(\"{col}\", 'UTF-8')")
490+
491+
def check_for_binary_cols(self, where_exprs):
492+
binary_uuid_columns = set()
493+
for expr in where_exprs:
494+
if isinstance(expr, BinBoolOp):
495+
for arg in expr.args:
496+
if isinstance(arg, _ResolveColumn):
497+
resolved_column = arg.resolved
498+
if isinstance(resolved_column, Column) and resolved_column.source_table.schema:
499+
if isinstance(resolved_column.type, Binary_UUID):
500+
binary_uuid_columns.add(resolved_column.name)
501+
return binary_uuid_columns
502+
485503
def render_select(self, parent_c: Compiler, elem: Select) -> str:
486504
c: Compiler = attrs.evolve(parent_c, in_select=True) # .add_table_context(self.table)
487505
compile_fn = functools.partial(self.compile, c)
@@ -497,7 +515,13 @@ def render_select(self, parent_c: Compiler, elem: Select) -> str:
497515
select += f" FROM {self.PLACEHOLDER_TABLE}"
498516

499517
if elem.where_exprs:
500-
select += " WHERE " + " AND ".join(map(compile_fn, elem.where_exprs))
518+
where_clause = " WHERE " + " AND ".join(map(compile_fn, elem.where_exprs))
519+
# post-processing step for Snowflake Binary_UUID columns
520+
if parent_c.dialect.name == "Snowflake":
521+
binary_uuids = self.check_for_binary_cols(elem.where_exprs)
522+
for binary_uuid in binary_uuids:
523+
where_clause = self.modify_string_where_clause(binary_uuid, where_clause)
524+
select += where_clause
501525

502526
if elem.group_by_exprs:
503527
select += " GROUP BY " + ", ".join(map(compile_fn, elem.group_by_exprs))
@@ -836,6 +860,9 @@ def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
836860
"""Creates an SQL expression, that strips uuids of artifacts like whitespace."""
837861
if isinstance(coltype, String_UUID):
838862
return f"TRIM({value})"
863+
# converts Binary to VARCHAR for Snowflake
864+
elif isinstance(coltype, Binary_UUID):
865+
return f"TRIM(TO_VARCHAR({value}, 'UTF-8'))"
839866
return self.to_string(value)
840867

841868
def normalize_json(self, value: str, _coltype: JSON) -> str:

0 commit comments

Comments
 (0)