diff --git a/data_validation/consts.py b/data_validation/consts.py index e11edb52c..1ecc61870 100644 --- a/data_validation/consts.py +++ b/data_validation/consts.py @@ -168,9 +168,6 @@ "pct_threshold", ] -# Constants for named columns used in generate partitions -# these cannot conflict with primary key column names -DVT_NTILE_COL = "dvt_ntile" -DVT_PART_NO = "dvt_part_no" -DVT_FIRST_PRE = "dvt_first_" # prefix for first_element_column names -DVT_LAST_PRE = "dvt_last_" # prefix for last_element_column names +# Constants for the named column used in generate partitions +# this cannot conflict with primary key column names +DVT_POS_COL = "dvt_pos_num" diff --git a/data_validation/partition_builder.py b/data_validation/partition_builder.py index accac6358..b8f940093 100644 --- a/data_validation/partition_builder.py +++ b/data_validation/partition_builder.py @@ -14,6 +14,7 @@ import os import ibis +import pandas import logging from typing import List, Dict from argparse import Namespace @@ -156,6 +157,13 @@ def _get_partition_key_filters(self) -> List[List[str]]: source_count = source_partition_row_builder.get_count() target_count = target_partition_row_builder.get_count() + # For some reason Teradata connector returns a dataframe with the count element, + # while the other connectors return a numpy.int64 value + if isinstance(source_count, pandas.DataFrame): + source_count = source_count.values[0][0] + if isinstance(target_count, pandas.DataFrame): + target_count = target_count.values[0][0] + if abs(source_count - target_count) > source_count * 0.1: logging.warning( "Source and Target table row counts vary by more than 10%," @@ -169,49 +177,46 @@ def _get_partition_key_filters(self) -> List[List[str]]: else source_count ) - # First we use the ntile aggregate function and divide assign a partition - # number to each row in the source table + # First we number each row in the source table. 
Using row_number instead of ntile since it is + # available on all platforms (Teradata does not support NTILE). For our purposes, it is likely + # more efficient window1 = ibis.window(order_by=self.primary_keys) - nt = ( - source_table[self.primary_keys[0]] - .ntile(buckets=number_of_part) - .over(window1) - .name(consts.DVT_NTILE_COL) - ) - dvt_nt = self.primary_keys.copy() - dvt_nt.append(nt) - partitioned_table = source_table.select(dvt_nt) - # Partitioned table is just the primary key columns in the source table along with - # an additional column with the partition number associated with each row. - - # We are interested in only the primary key values at the begining of - # each partitition - the following window groups by partition number - window2 = ibis.window( - order_by=self.primary_keys, group_by=[consts.DVT_NTILE_COL] - ) - first_pkys = [ - partitioned_table[primary_key] - .first() - .over(window2) - .name(consts.DVT_FIRST_PRE + primary_key) - for primary_key in self.primary_keys - ] - partition_no = ( - partitioned_table[consts.DVT_NTILE_COL] - .first() - .over(window2) - .name(consts.DVT_PART_NO) - ) - column_list = [partition_no] + first_pkys - partition_boundary = ( - partitioned_table.select(column_list) - .sort_by([consts.DVT_PART_NO]) - .distinct() + row_number = (ibis.row_number().over(window1) + 1).name(consts.DVT_POS_COL) + dvt_keys = self.primary_keys.copy() + dvt_keys.append(row_number) + rownum_table = source_table.select(dvt_keys) + # Rownum table is just the primary key columns in the source table along with + # an additional column with the row number associated with each row. + + # This rather complicated expression below is a filter (where) clause condition that filters the row numbers + # that correspond to the first element of the partition. The number of a partition is + # ceiling(row number * # of partitions / total number of rows). The first element of the partition is where + # the remainder, i.e. 
row number * # of partitions % total number of rows is > 0 and <= number of partitions. + # The remainder function does not work well with Teradata, hence writing that out explicitly. + cond = ( + ( + rownum_table[consts.DVT_POS_COL] * number_of_part + - ( + rownum_table[consts.DVT_POS_COL] * number_of_part / source_count + ).floor() + * source_count + ) + <= number_of_part + ) & ( + ( + rownum_table[consts.DVT_POS_COL] * number_of_part + - ( + rownum_table[consts.DVT_POS_COL] * number_of_part / source_count + ).floor() + * source_count + ) + > 0 ) + first_keys_table = rownum_table[cond].order_by(self.primary_keys) # Up until this point, we have built the table expression, have not executed the query yet. - # The query is now executed to find the first and last element of each partition - first_elements = partition_boundary.execute().to_numpy() + # The query is now executed to find the first element of each partition + first_elements = first_keys_table.execute().to_numpy() # Once we have the first element of each partition, we can generate the where clause # i.e. 
greater than or equal to first element and less than first element of next partition @@ -219,21 +224,27 @@ def _get_partition_key_filters(self) -> List[List[str]]: # partition and greater than or equal to the first element of the last partition respectively filter_clause_list = [] filter_clause_list.append( - self._less_than_value(self.primary_keys, first_elements[1, 1:]) + self._less_than_value( + self.primary_keys, first_elements[1, : len(self.primary_keys)] + ) ) for i in range(1, first_elements.shape[0] - 1): filter_clause_list.append( "(" - + self._geq_value(self.primary_keys, first_elements[i, 1:]) + + self._geq_value( + self.primary_keys, first_elements[i, : len(self.primary_keys)] + ) + ") AND (" + self._less_than_value( - self.primary_keys, first_elements[i + 1, 1:] + self.primary_keys, + first_elements[i + 1, : len(self.primary_keys)], ) + ")" ) filter_clause_list.append( self._geq_value( - self.primary_keys, first_elements[len(first_elements) - 1, 1:] + self.primary_keys, + first_elements[len(first_elements) - 1, : len(self.primary_keys)], ) ) diff --git a/data_validation/query_builder/partition_row_builder.py b/data_validation/query_builder/partition_row_builder.py index a14fe63ac..8f6768e1c 100644 --- a/data_validation/query_builder/partition_row_builder.py +++ b/data_validation/query_builder/partition_row_builder.py @@ -62,4 +62,4 @@ def _compile_query( def get_count(self) -> int: """Return a count of rows of primary keys - they should be all distinct""" - return self.query.select(self.primary_keys).count().execute() + return self.query[self.primary_keys].count().execute() diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py index 144fe18f4..1b67acb08 100644 --- a/tests/system/data_sources/test_bigquery.py +++ b/tests/system/data_sources/test_bigquery.py @@ -18,6 +18,7 @@ from data_validation import __main__ as main from data_validation import cli_tools, clients, consts, data_validation, state_manager 
from data_validation.query_builder import random_row_builder +from data_validation.partition_builder import PartitionBuilder from data_validation.query_builder.query_builder import QueryBuilder @@ -1094,6 +1095,54 @@ def test_custom_query(): assert result_df.source_agg_value.equals(result_df.target_agg_value) +# Expected result from partitioning table on 3 keys +EXPECTED_PARTITION_FILTER = [ + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND (student_id >= 5678)))" + + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", +] + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + return_value=BQ_CONN, +) +def test_bigquery_generate_table_partitions(mock_conn): + """Test generate table partitions on BigQuery + The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local + check that yaml configurations are created and saved in local storage. Partitions can only be created with + a database that can handle SQL with ntile, hence doing this as part of system testing. + What we are checking + 1. the shape of the partition list is 1, number of partitions (only one table in the list) + 2. value of the partition list matches what we expect. 
+ """ + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "generate-table-partitions", + "-sc=mock-conn", + "-tc=mock-conn", + "-tbls=pso_data_validator.test_generate_partitions=pso_data_validator.test_generate_partitions", + "-pk=course_id,quarter_id,student_id", + "-hash=*", + "-cdir=/home/users/yaml", + "-pn=4", + ] + ) + config_managers = main.build_config_managers_from_args(args, consts.ROW_VALIDATION) + partition_builder = PartitionBuilder(config_managers, args) + partition_filters = partition_builder._get_partition_key_filters() + + assert len(partition_filters) == 1 # only one pair of tables + assert ( + len(partition_filters[0]) == partition_builder.args.partition_num + ) # assume no of table rows > partition_num + assert partition_filters[0] == EXPECTED_PARTITION_FILTER + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", return_value=BQ_CONN, diff --git a/tests/system/data_sources/test_hive.py b/tests/system/data_sources/test_hive.py index 8976829b3..050fe3638 100644 --- a/tests/system/data_sources/test_hive.py +++ b/tests/system/data_sources/test_hive.py @@ -17,6 +17,7 @@ from data_validation import __main__ as main from data_validation import cli_tools, data_validation, consts +from data_validation.partition_builder import PartitionBuilder from tests.system.data_sources.test_bigquery import BQ_CONN @@ -98,6 +99,54 @@ def disabled_test_schema_validation_core_types(): assert len(df) == 0 +# Expected result from partitioning table on 3 keys +EXPECTED_PARTITION_FILTER = [ + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 
2 AND (student_id >= 5678)))" + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", +] + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_hive_generate_table_partitions(): + """Test generate table partitions on Hive + The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local + check that yaml configurations are created and saved in local storage. Partitions can only be created with + a database that can handle SQL window functions, hence doing this as part of system testing. + What we are checking + 1. the shape of the partition list is 1, number of partitions (only one table in the list) + 2. value of the partition list matches what we expect. + """ + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "generate-table-partitions", + "-sc=hive-conn", + "-tc=hive-conn", + "-tbls=pso_data_validator.test_generate_partitions=pso_data_validator.test_generate_partitions", + "-pk=course_id,quarter_id,student_id", + "-hash=*", + "-cdir=/home/users/yaml", + "-pn=4", + ] + ) + config_managers = main.build_config_managers_from_args(args, consts.ROW_VALIDATION) + partition_builder = PartitionBuilder(config_managers, args) + partition_filters = partition_builder._get_partition_key_filters() + + assert len(partition_filters) == 1 # only one pair of tables + assert ( + len(partition_filters[0]) == partition_builder.args.partition_num + ) # assume no of table rows > partition_num + assert partition_filters[0] == EXPECTED_PARTITION_FILTER + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, ) diff --git a/tests/system/data_sources/test_mysql.py b/tests/system/data_sources/test_mysql.py
index b670f9222..bb61b8b43 100644 --- a/tests/system/data_sources/test_mysql.py +++ b/tests/system/data_sources/test_mysql.py @@ -17,7 +17,7 @@ from data_validation import __main__ as main from data_validation import cli_tools, data_validation, consts, exceptions - +from data_validation.partition_builder import PartitionBuilder MYSQL_HOST = os.getenv("MYSQL_HOST", "localhost") MYSQL_USER = os.getenv("MYSQL_USER", "dvt") @@ -426,6 +426,54 @@ def test_mysql_row(): pass +# Expected result from partitioning table on 3 keys +EXPECTED_PARTITION_FILTER = [ + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND (student_id >= 5678)))" + + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", +] + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + return_value=CONN, +) +def test_mysql_generate_table_partitions(mock_conn): + """Test generate table partitions on mysql + The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local + check that yaml configurations are created and saved in local storage. Partitions can only be created with + a database that can handle SQL with ntile, hence doing this as part of system testing. + What we are checking + 1. the shape of the partition list is 1, number of partitions (only one table in the list) + 2. value of the partition list matches what we expect. 
+ """ + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "generate-table-partitions", + "-sc=mock-conn", + "-tc=mock-conn", + "-tbls=pso_data_validator.test_generate_partitions=pso_data_validator.test_generate_partitions", + "-pk=course_id,quarter_id,student_id", + "-hash=*", + "-cdir=/home/users/yaml", + "-pn=4", + ] + ) + config_managers = main.build_config_managers_from_args(args, consts.ROW_VALIDATION) + partition_builder = PartitionBuilder(config_managers, args) + partition_filters = partition_builder._get_partition_key_filters() + + assert len(partition_filters) == 1 # only one pair of tables + assert ( + len(partition_filters[0]) == partition_builder.args.partition_num + ) # assume no of table rows > partition_num + assert partition_filters[0] == EXPECTED_PARTITION_FILTER + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", return_value=CONN, diff --git a/tests/system/data_sources/test_oracle.py b/tests/system/data_sources/test_oracle.py index 22df61355..321d8198e 100644 --- a/tests/system/data_sources/test_oracle.py +++ b/tests/system/data_sources/test_oracle.py @@ -17,6 +17,7 @@ from data_validation import __main__ as main from data_validation import cli_tools, data_validation, consts +from data_validation.partition_builder import PartitionBuilder from tests.system.data_sources.test_bigquery import BQ_CONN @@ -70,6 +71,54 @@ def mock_get_connection_config(*args): return BQ_CONN +# Expected result from partitioning table on 3 keys +EXPECTED_PARTITION_FILTER = [ + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND 
(student_id >= 5678)))" + + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", +] + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_oracle_generate_table_partitions(): + """Test generate table partitions on Oracle + The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local + check that yaml configurations are created and saved in local storage. Partitions can only be created with + a database that can handle SQL with ntile, hence doing this as part of system testing. + What we are checking + 1. the shape of the partition list is 1, number of partitions (only one table in the list) + 2. value of the partition list matches what we expect. + """ + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "generate-table-partitions", + "-sc=mock-conn", + "-tc=mock-conn", + "-tbls=pso_data_validator.test_generate_partitions=pso_data_validator.test_generate_partitions", + "-pk=course_id,quarter_id,student_id", + "-hash=*", + "-cdir=/home/users/yaml", + "-pn=4", + ] + ) + config_managers = main.build_config_managers_from_args(args, consts.ROW_VALIDATION) + partition_builder = PartitionBuilder(config_managers, args) + partition_filters = partition_builder._get_partition_key_filters() + + assert len(partition_filters) == 1 # only one pair of tables + assert ( + len(partition_filters[0]) == partition_builder.args.partition_num + ) # assume no of table rows > partition_num + assert partition_filters[0] == EXPECTED_PARTITION_FILTER + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, diff --git a/tests/system/data_sources/test_postgres.py b/tests/system/data_sources/test_postgres.py index 
9b5a6b648..7ec8a9d49 100644 --- a/tests/system/data_sources/test_postgres.py +++ b/tests/system/data_sources/test_postgres.py @@ -454,12 +454,12 @@ def mock_get_connection_config(*args): # Expected result from partitioning table on 3 keys EXPECTED_PARTITION_FILTER = [ - "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 5678))", - "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 5678)))" - + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 9012)))", - "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND (student_id >= 9012)))" - + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 1234)))", - "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 2 OR quarter_id = 2 AND (student_id >= 1234))", + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND (student_id >= 5678)))" + + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", ] diff --git a/tests/system/data_sources/test_sql_server.py b/tests/system/data_sources/test_sql_server.py index eb043097f..6771e7d69 100644 --- a/tests/system/data_sources/test_sql_server.py +++ b/tests/system/data_sources/test_sql_server.py @@ -22,6 +22,7 @@ ) from data_validation import __main__ as main from data_validation 
import cli_tools, data_validation, consts +from data_validation.partition_builder import PartitionBuilder from tests.system.data_sources.test_bigquery import BQ_CONN @@ -199,6 +200,54 @@ def mock_get_connection_config(*args): return BQ_CONN +# Expected result from partitioning table on 3 keys +EXPECTED_PARTITION_FILTER = [ + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND (student_id >= 5678)))" + + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", +] + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_sqlserver_generate_table_partitions(cloud_sql): + """Test generate table partitions on sqlserver + The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local + check that yaml configurations are created and saved in local storage. Partitions can only be created with + a database that can handle SQL with ntile, hence doing this as part of system testing. + What we are checking + 1. the shape of the partition list is 1, number of partitions (only one table in the list) + 2. value of the partition list matches what we expect. 
+ """ + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "generate-table-partitions", + "-sc=mock-conn", + "-tc=mock-conn", + "-tbls=dbo.test_generate_partitions=dbo.test_generate_partitions", + "-pk=course_id,quarter_id,student_id", + "-hash=*", + "-cdir=/home/users/yaml", + "-pn=4", + ] + ) + config_managers = main.build_config_managers_from_args(args, consts.ROW_VALIDATION) + partition_builder = PartitionBuilder(config_managers, args) + partition_filters = partition_builder._get_partition_key_filters() + + assert len(partition_filters) == 1 # only one pair of tables + assert ( + len(partition_filters[0]) == partition_builder.args.partition_num + ) # assume no of table rows > partition_num + assert partition_filters[0] == EXPECTED_PARTITION_FILTER + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config, diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py index 5788065d4..7128859ea 100644 --- a/tests/system/data_sources/test_teradata.py +++ b/tests/system/data_sources/test_teradata.py @@ -17,6 +17,7 @@ from data_validation import __main__ as main from data_validation import cli_tools, data_validation, consts +from data_validation.partition_builder import PartitionBuilder from tests.system.data_sources.test_bigquery import BQ_CONN @@ -345,6 +346,54 @@ def test_row_validation_core_types(): assert len(df) == 0 +# Expected result from partitioning table on 3 keys +EXPECTED_PARTITION_FILTER = [ + "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", + "(course_id > 'ALG001' OR course_id = 'ALG001' AND (quarter_id > 3 OR quarter_id = 3 AND (student_id >= 1234)))" + + " AND (course_id < 'GEO001' OR course_id = 'GEO001' AND (quarter_id < 2 OR quarter_id = 2 AND (student_id < 5678)))", + "(course_id > 'GEO001' OR course_id = 'GEO001' AND (quarter_id > 2 OR quarter_id = 2 AND 
(student_id >= 5678)))" + " AND (course_id < 'TRI001' OR course_id = 'TRI001' AND (quarter_id < 1 OR quarter_id = 1 AND (student_id < 9012)))", + "course_id > 'TRI001' OR course_id = 'TRI001' AND (quarter_id > 1 OR quarter_id = 1 AND (student_id >= 9012))", +] + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, +) +def test_teradata_generate_table_partitions(): + """Test generate table partitions on Teradata + The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local + check that yaml configurations are created and saved in local storage. Partitions can only be created with + a database that can handle SQL window functions, hence doing this as part of system testing. + What we are checking + 1. the shape of the partition list is 1, number of partitions (only one table in the list) + 2. value of the partition list matches what we expect. + """ + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "generate-table-partitions", + "-sc=mock-conn", + "-tc=mock-conn", + "-tbls=udf.test_generate_partitions=udf.test_generate_partitions", + "-pk=course_id,quarter_id,student_id", + "-hash=*", + "-cdir=/home/users/yaml", + "-pn=4", + ] + ) + config_managers = main.build_config_managers_from_args(args, consts.ROW_VALIDATION) + partition_builder = PartitionBuilder(config_managers, args) + partition_filters = partition_builder._get_partition_key_filters() + + assert len(partition_filters) == 1 # only one pair of tables + assert ( + len(partition_filters[0]) == partition_builder.args.partition_num + ) # assume no of table rows > partition_num + assert partition_filters[0] == EXPECTED_PARTITION_FILTER + + @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", new=mock_get_connection_config,