GoogleCloudPlatform · nehanene15 · Aug 31, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 31, 2023
diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@ The Data Validation Tool (DVT) provides an automated and repeatable solution to
 perform this task.
 
 DVT supports the following validations:
-* Column validation (count, sum, avg, min, max, group by)
+* Column validation (count, sum, avg, min, max, stddev, group by)
 * Row validation (Not supported for FileSystem connections)
 * Schema validation
 * Custom Query validation
@@ -106,6 +106,7 @@ data-validation (--verbose or -v) (--log-level or -ll) validate column
   [--min COLUMNS]       Comma separated list of columns for min or * for all numeric
   [--max COLUMNS]       Comma separated list of columns for max or * for all numeric
   [--avg COLUMNS]       Comma separated list of columns for avg or * for all numeric
+  [--std COLUMNS]       Comma separated list of columns for stddev_samp or * for all numeric
   [--bq-result-handler or -bqrh PROJECT_ID.DATASET.TABLE]
                         BigQuery destination for validation results. Defaults to stdout.
                         See: *Validation Reports* section
@@ -311,6 +312,7 @@ data-validation (--verbose or -v) (--log-level or -ll) validate custom-query col
   [--min COLUMNS]       Comma separated list of columns for min or * for all numeric
   [--max COLUMNS]       Comma separated list of columns for max or * for all numeric
   [--avg COLUMNS]       Comma separated list of columns for avg or * for all numeric
+  [--std COLUMNS]       Comma separated list of columns for stddev_samp or * for all numeric
   [--bq-result-handler or -bqrh PROJECT_ID.DATASET.TABLE]
                         BigQuery destination for validation results. Defaults to stdout.
                         See: *Validation Reports* section
@@ -520,8 +522,8 @@ Functions, and other deployment services.
 ### Aggregated Fields
 
 Aggregate fields contain the SQL fields that you want to produce an aggregate
-for. Currently the functions `COUNT()`, `AVG()`, `SUM()`, `MIN()`, and `MAX()`
-are supported.
+for. Currently the functions `COUNT()`, `AVG()`, `SUM()`, `MIN()`, `MAX()`,
+and `STDDEV_SAMP()` are supported.
 
 Here is a sample aggregate config:
 ```yaml

@@ -114,6 +114,11 @@ def get_aggregate_config(args, config_manager: ConfigManager):
         aggregate_configs += config_manager.build_config_column_aggregates(
             "bit_xor", col_args, supported_data_types, cast_to_bigint=cast_to_bigint
         )
+    if args.std:
+        col_args = None if args.std == "*" else cli_tools.get_arg_list(args.std)
+        aggregate_configs += config_manager.build_config_column_aggregates(
+            "std", col_args, supported_data_types, cast_to_bigint=cast_to_bigint
+        )
     return aggregate_configs
 
 

@@ -545,6 +545,11 @@ def _configure_column_parser(column_parser):
         "-bit_xor",
         help="Comma separated list of columns for hashing a concatenate 'col_a,col_b' or * for all columns",
     )
+    optional_arguments.add_argument(
+        "--std",
+        "-std",
+        help="Comma separated list of columns for standard deviation 'col_a,col_b' or * for all columns",
+    )
     optional_arguments.add_argument(
         "--grouped-columns",
         "-gc",
@@ -785,6 +790,11 @@ def _configure_custom_query_column_parser(custom_query_column_parser):
         "-bit_xor",
         help="Comma separated list of columns for hashing a concatenate 'col_a,col_b' or * for all columns",
     )
+    optional_arguments.add_argument(
+        "--std",
+        "-std",
+        help="Comma separated list of columns for standard deviation 'col_a,col_b' or * for all columns",
+    )
     optional_arguments.add_argument(
         "--wildcard-include-string-len",
         "-wis",

@@ -96,6 +96,15 @@ def bit_xor(field_name=None, alias=None, cast=None):
             cast=cast,
         )
 
+    @staticmethod
+    def std(field_name=None, alias=None, cast=None):
+        return AggregateField(
+            ibis.expr.types.NumericColumn.std,
+            field_name=field_name,
+            alias=alias,
+            cast=cast,
+        )
+
     def compile(self, ibis_table):
         if self.field_name:
             agg_field = self.expr(ibis_table[self.field_name])

@@ -66,6 +66,12 @@
             consts.CONFIG_TARGET_COLUMN: "birth_year",
             consts.CONFIG_FIELD_ALIAS: "min_birth_year",
         },
+        {
+            consts.CONFIG_TYPE: "std",
+            consts.CONFIG_SOURCE_COLUMN: "tripduration",
+            consts.CONFIG_TARGET_COLUMN: "tripduration",
+            consts.CONFIG_FIELD_ALIAS: "std_tripduration",
+        },
     ],
     consts.CONFIG_FORMAT: "table",
     consts.CONFIG_FILTER_STATUS: None,
@@ -215,19 +221,6 @@
     consts.CONFIG_FILTER_STATUS: None,
 }
 
-CONFIG_SCHEMA_VALIDATION = {
-    # BigQuery Specific Connection Config
-    consts.CONFIG_SOURCE_CONN: BQ_CONN,
-    consts.CONFIG_TARGET_CONN: BQ_CONN,
-    # Validation Type
-    consts.CONFIG_TYPE: "Schema",
-    # Configuration Required Depending on Validator Type
-    consts.CONFIG_SCHEMA_NAME: "bigquery-public-data.new_york_citibike",
-    consts.CONFIG_TABLE_NAME: "citibike_trips",
-    consts.CONFIG_FORMAT: "table",
-    consts.CONFIG_FILTER_STATUS: None,
-}
-
 BQ_CONN_NAME = "bq-integration-test"
 CLI_CONFIG_FILE = "example_test.yaml"
 
@@ -427,12 +420,16 @@ def test_count_validator():
     min_birth_year_value = df[df["validation_name"] == "min_birth_year"][
         "source_agg_value"
     ].values[0]
+    std_tripduration_value = df[df["validation_name"] == "std_tripduration"][
+        "source_agg_value"
+    ].values[0]
 
     assert float(count_value) > 0
     assert float(count_tripduration_value) > 0
     assert float(avg_tripduration_value) > 0
     assert float(max_birth_year_value) > 0
     assert float(min_birth_year_value) > 0
+    assert float(std_tripduration_value) > 0
     assert (
         df["source_agg_value"].astype(float).sum()
         == df["target_agg_value"].astype(float).sum()
@@ -465,14 +462,6 @@ def test_numeric_types():
         )
 
 
-def test_schema_validation():
-    validator = data_validation.DataValidation(CONFIG_SCHEMA_VALIDATION, verbose=True)
-    df = validator.execute()
-
-    for validation in df.to_dict(orient="records"):
-        assert validation["validation_status"] == consts.VALIDATION_STATUS_SUCCESS
-
-
 def test_cli_store_yaml_then_run_gcs():
     """Test storing and retrieving validation YAML when GCS env var is set."""
     # Store BQ Connection

@@ -517,7 +517,6 @@ def _day_of_week_name(t, op):
         ops.Min: _reduction('min'),
         ops.Max: _reduction('max'),
         ops.Variance: variance_reduction('var', suffix={'sample': '', 'pop': 'p'}),
-        ops.StandardDev: variance_reduction('stdev', suffix={'sample': '', 'pop': 'p'}),
         ops.RandomScalar: _random,
         ops.TimestampNow: lambda *args: sa.func.timezone('UTC', sa.func.now()),
         ops.CumulativeAll: unary(sa.func.bool_and),