fix: Hash all bug, noxfile updates (GoogleCloudPlatform#413)
* fix: hash '*', updated docs, fixed nox failures due to new click release, cleaned up arg parser

* docs: update row level validation
nehanene15 committed Mar 31, 2022
1 parent 783cdf8 commit fc73e21
Showing 5 changed files with 43 additions and 23 deletions.
19 changes: 13 additions & 6 deletions README.md
@@ -1,4 +1,4 @@
# Data Validation Tool
# Data Validation Tool (Beta)

The Data Validation Tool (Beta) is an open-source Python CLI tool based on the
[Ibis framework](https://ibis-project.org/docs/tutorial/01-Introduction-to-Ibis.html)
@@ -12,10 +12,12 @@ after each migration step (e.g. data and schema migration, SQL script
translation, ETL migration, etc.). The Data Validation Tool (DVT) provides an
automated and repeatable solution to perform this task.

DVT supports the following validation types: * Table level * Table row count *
Group by row count * Column aggregation * Filters and limits * Column level *
Full column data type * Row level hash comparison (BigQuery tables only) * Raw
SQL exploration * Run custom queries on different data sources
DVT supports the following validations:
* Column validation (count, sum, avg, min/max, group_by)
* Row level validation
* Schema validation
* Custom Query validation
* Raw SQL exploration

DVT supports the following connection types:

@@ -35,6 +37,10 @@ DVT supports the following connection types:
The [Connections](docs/connections.md) page provides details about how to create
and list connections for the validation tool.

### Disclaimer
This is not an officially supported Google product. Please be aware that bugs may lurk, and that we reserve the right to make small backwards-incompatible changes. Feel free to open bugs or feature requests, or contribute directly
(see [CONTRIBUTING.md](CONTRIBUTING.md) for details).

## Installation

The [Installation](docs/installation.md) page describes the prerequisites and
@@ -136,7 +142,8 @@ used to run powerful validations without writing any queries.

#### Row Validations

(Note: Row hash validation is currently only supported for BigQuery, Teradata, and Impala/Hive)
(Note: Row hash validation is currently only supported for BigQuery, Teradata, and Impala/Hive. Struct and array
data types are not currently supported.)

Below is the command syntax for row validations. In order to run row level
validations you need to pass a `--primary-key` flag which defines what field(s)
11 changes: 7 additions & 4 deletions data_validation/__main__.py
@@ -136,6 +136,7 @@ def build_config_from_args(args, config_manager):
config_manager (ConfigManager): Validation config manager instance.
"""
config_manager.append_calculated_fields(get_calculated_config(args, config_manager))

if config_manager.validation_type == consts.COLUMN_VALIDATION:
config_manager.append_aggregates(get_aggregate_config(args, config_manager))
if args.grouped_columns is not None:
@@ -151,15 +152,16 @@
config_manager.append_comparison_fields(
config_manager.build_config_comparison_fields(comparison_fields)
)
config_manager.append_dependent_aliases(comparison_fields)
if args.hash != "*":
config_manager.append_dependent_aliases(comparison_fields)

if args.primary_keys is not None:
primary_keys = cli_tools.get_arg_list(args.primary_keys)
config_manager.append_primary_keys(
config_manager.build_config_comparison_fields(primary_keys)
)
config_manager.append_dependent_aliases(primary_keys)

# TODO(GH#18): Add query filter config logic
if args.hash != "*":
config_manager.append_dependent_aliases(primary_keys)

if config_manager.validation_type == consts.CUSTOM_QUERY:
config_manager.append_aggregates(get_aggregate_config(args, config_manager))
@@ -169,6 +171,7 @@
if args.target_query_file is not None:
query_file = cli_tools.get_arg_list(args.target_query_file)
config_manager.append_target_query_file(query_file)

return config_manager


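The `args.hash != "*"` guard above is the core of the "hash all" fix. Below is a minimal sketch of the resulting control flow; `FakeConfigManager` and `apply_hash_all_guard` are hypothetical stand-ins (the real `ConfigManager` lives elsewhere in the package), and the stated rationale — that `--hash '*'` already folds every column into the hash expression, so those columns must not be registered again as dependent aliases — is an inference from the commit, not documented behavior.

```python
# Minimal sketch, not the real implementation: FakeConfigManager is a
# hypothetical stand-in for data_validation's ConfigManager.
class FakeConfigManager:
    def __init__(self):
        self.dependent_aliases = []

    def append_dependent_aliases(self, columns):
        self.dependent_aliases.extend(columns)


def apply_hash_all_guard(manager, primary_keys, hash_arg):
    # Assumption: with --hash '*' every column is already swept into the
    # generated hash expression, so appending the keys again as dependent
    # aliases would register them twice -- the bug this commit fixes.
    if hash_arg != "*":
        manager.append_dependent_aliases(primary_keys)
    return manager


# With an explicit column list the aliases are still appended ...
assert apply_hash_all_guard(FakeConfigManager(), ["id"], "col_a,col_b").dependent_aliases == ["id"]
# ... but hashing '*' now skips the duplicate registration.
assert apply_hash_all_guard(FakeConfigManager(), ["id"], "*").dependent_aliases == []
```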
19 changes: 7 additions & 12 deletions data_validation/cli_tools.py
@@ -403,14 +403,10 @@ def _configure_row_parser(row_parser):
"-comp-fields",
help="Individual columns to compare. If comparing a calculated field use the column alias.",
)
row_parser.add_argument(
"--calculated-fields",
"-calc-fields",
help="list of calculated fields to generate.",
)
row_parser.add_argument(
"--primary-keys",
"-pk",
required=True,
help="Comma separated list of primary key columns 'col_a,col_b'",
)
row_parser.add_argument(
@@ -488,11 +484,6 @@ def _configure_column_parser(column_parser):
"-comp-fields",
help="list of fields to perform exact comparisons to. Use column aliases if this is calculated.",
)
column_parser.add_argument(
"--calculated-fields",
"-calc-fields",
help="list of calculated fields to generate.",
)
column_parser.add_argument(
"--grouped-columns",
"-gc",
@@ -612,8 +603,12 @@ def _configure_custom_query_parser(custom_query_parser):


def _add_common_arguments(parser):
parser.add_argument("--source-conn", "-sc", help="Source connection name")
parser.add_argument("--target-conn", "-tc", help="Target connection name")
parser.add_argument(
"--source-conn", "-sc", required=True, help="Source connection name"
)
parser.add_argument(
"--target-conn", "-tc", required=True, help="Target connection name"
)
parser.add_argument(
"--tables-list",
"-tbls",
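The changes in this file tighten the CLI contract: `--primary-keys` becomes mandatory for row validations, the connection flags become mandatory for every command, and the unused `--calculated-fields` arguments are dropped. A small, self-contained sketch of what `required=True` buys — plain argparse behavior, not project code:

```python
import argparse

parser = argparse.ArgumentParser(prog="data-validation")
parser.add_argument("--source-conn", "-sc", required=True, help="Source connection name")
parser.add_argument("--target-conn", "-tc", required=True, help="Target connection name")

# Omitting either flag now fails fast with a usage message instead of
# passing a None connection name deeper into the validation run.
try:
    parser.parse_args([])
except SystemExit:
    print("argparse rejected the call: --source-conn/--target-conn are required")
```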
4 changes: 3 additions & 1 deletion noxfile.py
@@ -93,7 +93,9 @@ def lint(session):
serious code quality issues.
"""

_setup_session_requirements(session, extra_packages=["flake8", "black==19.10b0"])
_setup_session_requirements(
session, extra_packages=["flake8", "black==19.10b0", "click==8.0.4"]
)
session.install("--upgrade", "pip", "wheel")
session.run("flake8", "data_validation")
session.run("flake8", "tests")
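The `click==8.0.4` pin addresses the nox failures mentioned in the commit message: click 8.1.0, released days before this commit, removed the private `_unicodefun` helper that black 19.10b0 imports, so unpinned lint sessions began failing. A condensed sketch of a pinned session, assuming `_setup_session_requirements` simply installs the listed extra packages:

```python
import nox


@nox.session
def lint(session):
    # Pin click so black 19.10b0 can still import click._unicodefun;
    # click 8.1.0 removed it and broke older black releases.
    session.install("flake8", "black==19.10b0", "click==8.0.4")
    session.run("flake8", "data_validation")
    session.run("black", "--check", "data_validation")
```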
13 changes: 13 additions & 0 deletions third_party/ibis/ibis_addon/datatypes.py
@@ -15,6 +15,7 @@

from google.cloud import bigquery
from ibis_bigquery.client import _DTYPE_TO_IBIS_TYPE
from ibis_bigquery.datatypes import ibis_type_to_bigquery_type, TypeTranslationContext, trans_numeric_udf
import ibis.expr.datatypes as dt
from ibis.backends.pandas.client import _inferable_pandas_dtypes
import pyarrow
@@ -33,3 +34,15 @@
_inferable_pandas_dtypes['datetime64'] = dt.timestamp
_inferable_pandas_dtypes['datetime'] = dt.timestamp
_inferable_pandas_dtypes['time'] = dt.time

# Patch a bug in Ibis BigQuery's Decimal translation that was fixed upstream in version 2.1.1
@ibis_type_to_bigquery_type.register(dt.Decimal, TypeTranslationContext)
def trans_numeric(t, context):
if (t.precision, t.scale) != (38, 9):
raise TypeError(
'BigQuery only supports decimal types with precision of 38 and '
'scale of 9'
)
return 'NUMERIC'

trans_numeric_udf = trans_numeric
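Because `ibis_type_to_bigquery_type` is a multiple-dispatch function, the registration above overrides Decimal translation globally once this module is imported. A hedged usage sketch — it assumes ibis-bigquery < 2.1.1 (where the upstream bug exists) and that `TypeTranslationContext` can be instantiated directly:

```python
import ibis.expr.datatypes as dt

context = TypeTranslationContext()

# BigQuery's NUMERIC is exactly Decimal(38, 9) ...
assert ibis_type_to_bigquery_type(dt.Decimal(38, 9), context) == "NUMERIC"

# ... and any other precision/scale is rejected before a query is built.
try:
    ibis_type_to_bigquery_type(dt.Decimal(20, 4), context)
except TypeError as exc:
    print(exc)
```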
