Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessed distinct grouping mode #1342

Merged
merged 7 commits into from
May 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions db/functions/operations/deserialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_db_function_from_ma_function_spec(spec):
"""
try:
db_function_subclass_id, raw_parameters = get_raw_spec_components(spec)
db_function_subclass = _get_db_function_subclass_by_id(db_function_subclass_id)
db_function_subclass = get_db_function_subclass_by_id(db_function_subclass_id)
parameters = [
_process_parameter(
parameter=raw_parameter,
Expand Down Expand Up @@ -58,7 +58,7 @@ def _process_parameter(parameter, parent_db_function_subclass):
)


def _get_db_function_subclass_by_id(subclass_id):
def get_db_function_subclass_by_id(subclass_id):
for db_function_subclass in known_db_functions:
if db_function_subclass.id == subclass_id:
return db_function_subclass
Expand Down
32 changes: 29 additions & 3 deletions db/records/operations/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from sqlalchemy import select, func, and_, case, literal, cast, TEXT

from db.functions.operations.deserialize import get_db_function_subclass_by_id
from db.records import exceptions as records_exceptions
from db.records.operations import calculation
from db.records.utils import create_col_objects
Expand Down Expand Up @@ -37,6 +38,7 @@ def __init__(
self,
columns,
mode=GroupMode.DISTINCT.value,
preproc=None,
num_groups=None,
bound_tuples=None,
count_by=None,
Expand All @@ -46,6 +48,12 @@ def __init__(
):
self._columns = tuple(columns) if type(columns) != str else tuple([columns])
self._mode = mode
if type(preproc) == str:
self._preproc = tuple([preproc])
elif preproc is not None:
self._preproc = tuple(preproc)
else:
self._preproc = None
self._num_groups = num_groups
self._bound_tuples = bound_tuples
self._count_by = count_by
Expand All @@ -63,6 +71,10 @@ def columns(self):
def mode(self):
return self._mode

@property
def preproc(self):
# Read-only view of the preprocessing spec given at construction: either
# None (no preprocessing requested) or a tuple with one entry per grouping
# column, as normalized in __init__ and length-checked in validate().
return self._preproc

@property
def num_groups(self):
return self._num_groups
Expand Down Expand Up @@ -107,6 +119,11 @@ def validate(self):
f'mode "{self.mode}" is invalid. valid modes are: '
+ ', '.join([f"'{gm}'" for gm in group_modes])
)
elif self.preproc is not None and len(self.preproc) != len(self.columns):
raise records_exceptions.BadGroupFormat(
'preproc must be same length as columns if given'
)

elif (
self.mode == GroupMode.PERCENTILE.value
and not type(self.num_groups) == int
Expand Down Expand Up @@ -207,21 +224,30 @@ def get_group_augmented_records_query(table, group_by):
elif group_by.mode == GroupMode.MAGNITUDE.value:
query = _get_tens_powers_range_group_select(table, grouping_columns)
elif group_by.mode == GroupMode.DISTINCT.value:
query = _get_distinct_group_select(table, grouping_columns)
query = _get_distinct_group_select(table, grouping_columns, group_by.preproc)
elif group_by.mode == GroupMode.PREFIX.value:
query = _get_prefix_group_select(table, grouping_columns, group_by.prefix_length)
else:
raise records_exceptions.BadGroupFormat("Unknown error")
return query


def _get_distinct_group_select(table, grouping_columns):
def _get_distinct_group_select(table, grouping_columns, preproc):
window_def = GroupingWindowDefinition(
order_by=grouping_columns, partition_by=grouping_columns
)

if preproc is not None:
processed_columns = [
get_db_function_subclass_by_id(proc).to_sa_expression(col)
for proc, col in zip(preproc, grouping_columns)
if proc is not None
]
else:
processed_columns = grouping_columns

group_id_expr = func.dense_rank().over(
order_by=window_def.order_by, range_=window_def.range_
order_by=processed_columns, range_=window_def.range_
)
return select(
table,
Expand Down
53 changes: 52 additions & 1 deletion db/tests/records/operations/test_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

from db.records.operations import group
from db.records import exceptions as records_exceptions
from db.tests.types import fixtures

engine_with_types = fixtures.engine_with_types
uris_table_obj = fixtures.uris_table_obj


@pytest.fixture
Expand All @@ -11,7 +15,7 @@ def roster_distinct_setup(roster_table_obj):
input_cols = ['Student Number', 'Student Email']
gb = group.GroupBy(columns=input_cols)
grouping_columns = gb.get_validated_group_by_columns(roster)
sel = group._get_distinct_group_select(roster, grouping_columns)
sel = group._get_distinct_group_select(roster, grouping_columns, None)
with engine.begin() as conn:
res = conn.execute(sel).fetchall()
return res
Expand Down Expand Up @@ -327,6 +331,45 @@ def test_smoke_get_group_augmented_records_query_prefix(roster_table_obj):
)


def test_smoke_get_group_augmented_records_query_email_preproc(roster_table_obj):
    """Smoke test: distinct grouping on an email column with a preproc function.

    Builds the group-augmented query for 'Student Email' preprocessed with
    'extract_email_domain', executes it, and checks that every returned row
    carries all group metadata fields.
    """
    roster, engine = roster_table_obj
    email_domain_grouping = group.GroupBy(
        ['Student Email'],
        mode=group.GroupMode.DISTINCT.value,
        preproc=['extract_email_domain'],
    )
    query = group.get_group_augmented_records_query(roster, email_domain_grouping)
    with engine.begin() as conn:
        rows = conn.execute(query).fetchall()

    expected_keys = [field.value for field in group.GroupMetadataField]
    for row in rows:
        group_metadata = row[group.MATHESAR_GROUP_METADATA]
        # Every metadata field must be present in each row's group metadata.
        for key in expected_keys:
            assert key in group_metadata


@pytest.mark.parametrize('preproc', ['extract_uri_authority', 'extract_uri_scheme'])
def test_smoke_get_group_augmented_records_query_uris_preproc(uris_table_obj, preproc):
    """Smoke test: distinct grouping on a URI column with each URI preproc function.

    For each parametrized preproc id, builds and executes the group-augmented
    query on the 'uri' column and checks that every returned row carries all
    group metadata fields.
    """
    uris_table, engine = uris_table_obj
    uri_grouping = group.GroupBy(
        ['uri'],
        mode=group.GroupMode.DISTINCT.value,
        preproc=[preproc],
    )
    query = group.get_group_augmented_records_query(uris_table, uri_grouping)
    with engine.begin() as conn:
        rows = conn.execute(query).fetchall()

    expected_keys = [field.value for field in group.GroupMetadataField]
    for row in rows:
        group_metadata = row[group.MATHESAR_GROUP_METADATA]
        # Every metadata field must be present in each row's group metadata.
        for key in expected_keys:
            assert key in group_metadata


single_col_number_modes = [
group.GroupMode.MAGNITUDE.value,
group.GroupMode.COUNT_BY.value,
Expand Down Expand Up @@ -363,6 +406,14 @@ def test_smoke_get_group_augmented_records_query_magnitude(magnitude_table_obj,
),
259
),
(
group.GroupBy(
['Student Number', 'Student Email'],
mode=group.GroupMode.DISTINCT.value,
preproc=[None, 'extract_email_domain']
),
3
),
(
group.GroupBy(
['Student Number', 'Student Email'],
Expand Down
4 changes: 2 additions & 2 deletions db/tests/resources/roster_create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ CREATE TABLE "Roster" (
id integer NOT NULL,
"Student Number" uuid,
"Student Name" character varying(100),
"Student Email" character varying(150),
"Student Email" mathesar_types.email,
"Teacher" character varying(100),
"Teacher Email" character varying(150),
"Teacher Email" mathesar_types.email,
"Subject" character varying(20),
"Grade" integer
);
Expand Down
101 changes: 101 additions & 0 deletions db/tests/resources/uri_testing
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
--
-- PostgreSQL database dump
--

-- Dumped from database version 13.6 (Debian 13.6-1.pgdg110+1)
-- Dumped by pg_dump version 14.2

SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SELECT pg_catalog.set_config('search_path', '', false);
SET check_function_bodies = false;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;

SET default_tablespace = '';

SET default_table_access_method = heap;

--
-- Name: uri_testing; Type: TABLE; Schema: public; Owner: mathesar
--

CREATE TABLE public.uri_testing (
id integer NOT NULL,
uris mathesar_types.uri
);


ALTER TABLE public.uri_testing OWNER TO mathesar;

--
-- Name: uri_testing_id_seq; Type: SEQUENCE; Schema: public; Owner: mathesar
--

CREATE SEQUENCE public.uri_testing_id_seq
AS integer
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


ALTER TABLE public.uri_testing_id_seq OWNER TO mathesar;

--
-- Name: uri_testing_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: mathesar
--

ALTER SEQUENCE public.uri_testing_id_seq OWNED BY public.uri_testing.id;


--
-- Name: uri_testing id; Type: DEFAULT; Schema: public; Owner: mathesar
--

ALTER TABLE ONLY public.uri_testing ALTER COLUMN id SET DEFAULT nextval('public.uri_testing_id_seq'::regclass);


--
-- Data for Name: uri_testing; Type: TABLE DATA; Schema: public; Owner: mathesar
--

COPY public.uri_testing (id, uris) FROM stdin;
1 https://google.com
2 https://yahoo.com
3 https://github.com/centerofci/mathesar/issues?q=is%3Aissue+is%3Aopen+group
4 https://github.com/centerofci/mathesar/issues?q=is%3Aissue+group+is%3Aclosed
5 https://github.com/centerofci/mathesar/
6 ftp://ftp.example.com/path/to/RFC/rfc959.txt
7 sftp://ftp.example.com/path/to/RFC/rfc959.txt
8 http://google.com
9 http://yahoo.com
10 http://github.com/centerofci/mathesar/issues?q=is%3Aissue+is%3Aopen+group
11 http://github.com/centerofci/mathesar/issues?q=is%3Aissue+group+is%3Aclosed
\.


--
-- Name: uri_testing_id_seq; Type: SEQUENCE SET; Schema: public; Owner: mathesar
--

SELECT pg_catalog.setval('public.uri_testing_id_seq', 11, true);


--
-- Name: uri_testing uri_testing_pkey; Type: CONSTRAINT; Schema: public; Owner: mathesar
--

ALTER TABLE ONLY public.uri_testing
ADD CONSTRAINT uri_testing_pkey PRIMARY KEY (id);


--
-- PostgreSQL database dump complete
--

1 change: 1 addition & 0 deletions mathesar/api/pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def paginate_queryset(
'count_by': group_by.count_by,
'global_min': group_by.global_min,
'global_max': group_by.global_max,
'preproc': group_by.preproc,
'prefix_length': group_by.prefix_length,
'ranged': group_by.ranged,
'groups': groups,
Expand Down
58 changes: 57 additions & 1 deletion mathesar/tests/api/test_record_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,56 @@ def test_record_list_sort(create_table, client):
},
],
),
(
'Count By Grouping',
{
'columns': ['id'],
'mode': 'count_by',
'global_min': 0,
'global_max': 1000,
'count_by': 50
},
[
{
'count': 49,
'first_value': {'id': 1},
'last_value': {'id': 49},
'less_than_eq_value': None,
'greater_than_eq_value': {'id': 0},
'less_than_value': {'id': 50},
'greater_than_value': None,
'result_indices': [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
47, 48
]
}, {
'count': 50,
'first_value': {'id': 50},
'last_value': {'id': 99},
'less_than_eq_value': None,
'greater_than_eq_value': {'id': 50},
'less_than_value': {'id': 100},
'greater_than_value': None,
'result_indices': [
49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93,
94, 95, 96, 97, 98
]
}, {
'count': 50,
'first_value': {'id': 100},
'last_value': {'id': 149},
'less_than_eq_value': None,
'greater_than_eq_value': {'id': 100},
'less_than_value': {'id': 150},
'greater_than_value': None,
'result_indices': [99]
}
]
),
(
'NASA Record List Group Prefix',
{'columns': ['Case Number'], 'mode': 'prefix', 'prefix_length': 3},
Expand Down Expand Up @@ -574,21 +624,27 @@ def _test_group_equality(actual_groups, expect_groups):
else:
assert actual_item is None

def _retuple_bound_tuples(bound_tuple_list):
if bound_tuple_list is not None:
return [tuple(t) for t in grouping_dict['bound_tuples']]

assert response.status_code == 200
assert response_data['count'] == 1393
assert len(response_data['results']) == limit

group_by = GroupBy(**grouping)
grouping_dict = response_data['grouping']
print(grouping_dict)
assert grouping_dict['columns'] == [
columns_name_id_map[colname] for colname in group_by.columns
]
assert grouping_dict['mode'] == group_by.mode
assert grouping_dict['num_groups'] == group_by.num_groups
assert grouping_dict['bound_tuples'] == group_by.bound_tuples
assert _retuple_bound_tuples(grouping_dict['bound_tuples']) == group_by.bound_tuples
assert grouping_dict['count_by'] == group_by.count_by
assert grouping_dict['global_min'] == group_by.global_min
assert grouping_dict['global_max'] == group_by.global_max
assert grouping_dict['preproc'] == group_by.preproc
assert grouping_dict['prefix_length'] == group_by.prefix_length
assert grouping_dict['ranged'] == group_by.ranged
_test_group_equality(grouping_dict['groups'], expected_groups)
Expand Down