Skip to content

Commit

Permalink
Merge pull request #1342 from centerofci/preproc_grouping
Browse files Browse the repository at this point in the history
Preprocessed distinct grouping mode
  • Loading branch information
mathemancer committed May 4, 2022
2 parents c00598a + ff482a2 commit 290bb16
Show file tree
Hide file tree
Showing 7 changed files with 244 additions and 9 deletions.
4 changes: 2 additions & 2 deletions db/functions/operations/deserialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_db_function_from_ma_function_spec(spec):
"""
try:
db_function_subclass_id, raw_parameters = get_raw_spec_components(spec)
db_function_subclass = _get_db_function_subclass_by_id(db_function_subclass_id)
db_function_subclass = get_db_function_subclass_by_id(db_function_subclass_id)
parameters = [
_process_parameter(
parameter=raw_parameter,
Expand Down Expand Up @@ -58,7 +58,7 @@ def _process_parameter(parameter, parent_db_function_subclass):
)


def _get_db_function_subclass_by_id(subclass_id):
def get_db_function_subclass_by_id(subclass_id):
for db_function_subclass in known_db_functions:
if db_function_subclass.id == subclass_id:
return db_function_subclass
Expand Down
32 changes: 29 additions & 3 deletions db/records/operations/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from sqlalchemy import select, func, and_, case, literal, cast, TEXT

from db.functions.operations.deserialize import get_db_function_subclass_by_id
from db.records import exceptions as records_exceptions
from db.records.operations import calculation
from db.records.utils import create_col_objects
Expand Down Expand Up @@ -37,6 +38,7 @@ def __init__(
self,
columns,
mode=GroupMode.DISTINCT.value,
preproc=None,
num_groups=None,
bound_tuples=None,
count_by=None,
Expand All @@ -46,6 +48,12 @@ def __init__(
):
self._columns = tuple(columns) if type(columns) != str else tuple([columns])
self._mode = mode
if type(preproc) == str:
self._preproc = tuple([preproc])
elif preproc is not None:
self._preproc = tuple(preproc)
else:
self._preproc = None
self._num_groups = num_groups
self._bound_tuples = bound_tuples
self._count_by = count_by
Expand All @@ -63,6 +71,10 @@ def columns(self):
def mode(self):
return self._mode

@property
def preproc(self):
    """Preprocessing function ids for the grouping columns, or ``None``.

    The constructor stores this as a tuple (a lone string is wrapped into a
    one-element tuple) or as ``None`` when no preprocessing was requested.
    """
    return self._preproc

@property
def num_groups(self):
return self._num_groups
Expand Down Expand Up @@ -107,6 +119,11 @@ def validate(self):
f'mode "{self.mode}" is invalid. valid modes are: '
+ ', '.join([f"'{gm}'" for gm in group_modes])
)
elif self.preproc is not None and len(self.preproc) != len(self.columns):
raise records_exceptions.BadGroupFormat(
'preproc must be same length as columns if given'
)

elif (
self.mode == GroupMode.PERCENTILE.value
and not type(self.num_groups) == int
Expand Down Expand Up @@ -207,21 +224,30 @@ def get_group_augmented_records_query(table, group_by):
elif group_by.mode == GroupMode.MAGNITUDE.value:
query = _get_tens_powers_range_group_select(table, grouping_columns)
elif group_by.mode == GroupMode.DISTINCT.value:
query = _get_distinct_group_select(table, grouping_columns)
query = _get_distinct_group_select(table, grouping_columns, group_by.preproc)
elif group_by.mode == GroupMode.PREFIX.value:
query = _get_prefix_group_select(table, grouping_columns, group_by.prefix_length)
else:
raise records_exceptions.BadGroupFormat("Unknown error")
return query


def _get_distinct_group_select(table, grouping_columns):
def _get_distinct_group_select(table, grouping_columns, preproc):
window_def = GroupingWindowDefinition(
order_by=grouping_columns, partition_by=grouping_columns
)

if preproc is not None:
processed_columns = [
get_db_function_subclass_by_id(proc).to_sa_expression(col)
for proc, col in zip(preproc, grouping_columns)
if proc is not None
]
else:
processed_columns = grouping_columns

group_id_expr = func.dense_rank().over(
order_by=window_def.order_by, range_=window_def.range_
order_by=processed_columns, range_=window_def.range_
)
return select(
table,
Expand Down
53 changes: 52 additions & 1 deletion db/tests/records/operations/test_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

from db.records.operations import group
from db.records import exceptions as records_exceptions
from db.tests.types import fixtures

engine_with_types = fixtures.engine_with_types
uris_table_obj = fixtures.uris_table_obj


@pytest.fixture
Expand All @@ -11,7 +15,7 @@ def roster_distinct_setup(roster_table_obj):
input_cols = ['Student Number', 'Student Email']
gb = group.GroupBy(columns=input_cols)
grouping_columns = gb.get_validated_group_by_columns(roster)
sel = group._get_distinct_group_select(roster, grouping_columns)
sel = group._get_distinct_group_select(roster, grouping_columns, None)
with engine.begin() as conn:
res = conn.execute(sel).fetchall()
return res
Expand Down Expand Up @@ -327,6 +331,45 @@ def test_smoke_get_group_augmented_records_query_prefix(roster_table_obj):
)


def test_smoke_get_group_augmented_records_query_email_preproc(roster_table_obj):
    """Smoke-test DISTINCT grouping with the email-domain preprocessor.

    Builds the group-augmented query for the 'Student Email' column, runs it,
    and checks that every returned row's group metadata contains every field
    named in ``group.GroupMetadataField``.
    """
    table, db_engine = roster_table_obj
    spec = group.GroupBy(
        ['Student Email'],
        mode=group.GroupMode.DISTINCT.value,
        preproc=['extract_email_domain'],
    )
    query = group.get_group_augmented_records_query(table, spec)
    with db_engine.begin() as conn:
        rows = conn.execute(query).fetchall()

    # The set of required metadata keys is the same for every row.
    required_keys = [field.value for field in group.GroupMetadataField]
    for row in rows:
        metadata = row[group.MATHESAR_GROUP_METADATA]
        assert all(key in metadata for key in required_keys)


@pytest.mark.parametrize('preproc', ['extract_uri_authority', 'extract_uri_scheme'])
def test_smoke_get_group_augmented_records_query_uris_preproc(uris_table_obj, preproc):
    """Smoke-test DISTINCT grouping with each URI preprocessor.

    For the parametrized preprocessing function id, builds the
    group-augmented query for the 'uri' column, runs it, and checks that every
    returned row's group metadata contains every field named in
    ``group.GroupMetadataField``.
    """
    uris_table, db_engine = uris_table_obj
    spec = group.GroupBy(
        ['uri'],
        mode=group.GroupMode.DISTINCT.value,
        preproc=[preproc],
    )
    query = group.get_group_augmented_records_query(uris_table, spec)
    with db_engine.begin() as conn:
        rows = conn.execute(query).fetchall()

    # The set of required metadata keys is the same for every row.
    required_keys = [field.value for field in group.GroupMetadataField]
    for row in rows:
        metadata = row[group.MATHESAR_GROUP_METADATA]
        assert all(key in metadata for key in required_keys)


single_col_number_modes = [
group.GroupMode.MAGNITUDE.value,
group.GroupMode.COUNT_BY.value,
Expand Down Expand Up @@ -363,6 +406,14 @@ def test_smoke_get_group_augmented_records_query_magnitude(magnitude_table_obj,
),
259
),
(
group.GroupBy(
['Student Number', 'Student Email'],
mode=group.GroupMode.DISTINCT.value,
preproc=[None, 'extract_email_domain']
),
3
),
(
group.GroupBy(
['Student Number', 'Student Email'],
Expand Down
4 changes: 2 additions & 2 deletions db/tests/resources/roster_create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ CREATE TABLE "Roster" (
id integer NOT NULL,
"Student Number" uuid,
"Student Name" character varying(100),
"Student Email" character varying(150),
"Student Email" mathesar_types.email,
"Teacher" character varying(100),
"Teacher Email" character varying(150),
"Teacher Email" mathesar_types.email,
"Subject" character varying(20),
"Grade" integer
);
Expand Down
101 changes: 101 additions & 0 deletions db/tests/resources/uri_testing
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
--
-- PostgreSQL database dump
--

-- Dumped from database version 13.6 (Debian 13.6-1.pgdg110+1)
-- Dumped by pg_dump version 14.2

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;

SET default_tablespace = '';

SET default_table_access_method = heap;

--
-- Name: uri_testing; Type: TABLE; Schema: public; Owner: mathesar
--

CREATE TABLE public.uri_testing (
id integer NOT NULL,
uris mathesar_types.uri
);


ALTER TABLE public.uri_testing OWNER TO mathesar;

--
-- Name: uri_testing_id_seq; Type: SEQUENCE; Schema: public; Owner: mathesar
--

CREATE SEQUENCE public.uri_testing_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


ALTER TABLE public.uri_testing_id_seq OWNER TO mathesar;

--
-- Name: uri_testing_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: mathesar
--

ALTER SEQUENCE public.uri_testing_id_seq OWNED BY public.uri_testing.id;


--
-- Name: uri_testing id; Type: DEFAULT; Schema: public; Owner: mathesar
--

ALTER TABLE ONLY public.uri_testing ALTER COLUMN id SET DEFAULT nextval('public.uri_testing_id_seq'::regclass);


--
-- Data for Name: uri_testing; Type: TABLE DATA; Schema: public; Owner: mathesar
--

COPY public.uri_testing (id, uris) FROM stdin;
1 https://google.com
2 https://yahoo.com
3 https://github.com/centerofci/mathesar/issues?q=is%3Aissue+is%3Aopen+group
4 https://github.com/centerofci/mathesar/issues?q=is%3Aissue+group+is%3Aclosed
5 https://github.com/centerofci/mathesar/
6 ftp://ftp.example.com/path/to/RFC/rfc959.txt
7 sftp://ftp.example.com/path/to/RFC/rfc959.txt
8 http://google.com
9 http://yahoo.com
10 http://github.com/centerofci/mathesar/issues?q=is%3Aissue+is%3Aopen+group
11 http://github.com/centerofci/mathesar/issues?q=is%3Aissue+group+is%3Aclosed
\.


--
-- Name: uri_testing_id_seq; Type: SEQUENCE SET; Schema: public; Owner: mathesar
--

SELECT pg_catalog.setval('public.uri_testing_id_seq', 11, true);


--
-- Name: uri_testing uri_testing_pkey; Type: CONSTRAINT; Schema: public; Owner: mathesar
--

ALTER TABLE ONLY public.uri_testing
ADD CONSTRAINT uri_testing_pkey PRIMARY KEY (id);


--
-- PostgreSQL database dump complete
--

1 change: 1 addition & 0 deletions mathesar/api/pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def paginate_queryset(
'count_by': group_by.count_by,
'global_min': group_by.global_min,
'global_max': group_by.global_max,
'preproc': group_by.preproc,
'prefix_length': group_by.prefix_length,
'ranged': group_by.ranged,
'groups': groups,
Expand Down
58 changes: 57 additions & 1 deletion mathesar/tests/api/test_record_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,56 @@ def test_record_list_sort(create_table, client):
},
],
),
(
'Count By Grouping',
{
'columns': ['id'],
'mode': 'count_by',
'global_min': 0,
'global_max': 1000,
'count_by': 50
},
[
{
'count': 49,
'first_value': {'id': 1},
'last_value': {'id': 49},
'less_than_eq_value': None,
'greater_than_eq_value': {'id': 0},
'less_than_value': {'id': 50},
'greater_than_value': None,
'result_indices': [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
47, 48
]
}, {
'count': 50,
'first_value': {'id': 50},
'last_value': {'id': 99},
'less_than_eq_value': None,
'greater_than_eq_value': {'id': 50},
'less_than_value': {'id': 100},
'greater_than_value': None,
'result_indices': [
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93,
94, 95, 96, 97, 98
]
}, {
'count': 50,
'first_value': {'id': 100},
'last_value': {'id': 149},
'less_than_eq_value': None,
'greater_than_eq_value': {'id': 100},
'less_than_value': {'id': 150},
'greater_than_value': None,
'result_indices': [99]
}
]
),
(
'NASA Record List Group Prefix',
{'columns': ['Case Number'], 'mode': 'prefix', 'prefix_length': 3},
Expand Down Expand Up @@ -574,21 +624,27 @@ def _test_group_equality(actual_groups, expect_groups):
else:
assert actual_item is None

def _retuple_bound_tuples(bound_tuple_list):
if bound_tuple_list is not None:
return [tuple(t) for t in grouping_dict['bound_tuples']]

assert response.status_code == 200
assert response_data['count'] == 1393
assert len(response_data['results']) == limit

group_by = GroupBy(**grouping)
grouping_dict = response_data['grouping']
print(grouping_dict)
assert grouping_dict['columns'] == [
columns_name_id_map[colname] for colname in group_by.columns
]
assert grouping_dict['mode'] == group_by.mode
assert grouping_dict['num_groups'] == group_by.num_groups
assert grouping_dict['bound_tuples'] == group_by.bound_tuples
assert _retuple_bound_tuples(grouping_dict['bound_tuples']) == group_by.bound_tuples
assert grouping_dict['count_by'] == group_by.count_by
assert grouping_dict['global_min'] == group_by.global_min
assert grouping_dict['global_max'] == group_by.global_max
assert grouping_dict['preproc'] == group_by.preproc
assert grouping_dict['prefix_length'] == group_by.prefix_length
assert grouping_dict['ranged'] == group_by.ranged
_test_group_equality(grouping_dict['groups'], expected_groups)
Expand Down

0 comments on commit 290bb16

Please sign in to comment.