Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rj/col pruning #704

Merged
merged 4 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ set(P3_FILES
"src/optimizer/sort_limit_as_topn.cpp"
"src/optimizer/optimizer_internal.cpp"
"src/optimizer/seqscan_as_indexscan.cpp"
"src/optimizer/column_pruning.cpp"
"src/common/bustub_ddl.cpp"
"src/include/execution/plans/topn_per_group_plan.h"
${P2_FILES}
Expand Down
15 changes: 15 additions & 0 deletions src/common/util/string_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ void StringUtil::RTrim(std::string *str) {
str->erase(std::find_if(str->rbegin(), str->rend(), [](int ch) { return std::isspace(ch) == 0; }).base(), str->end());
}

// Removes leading whitespace (' ', \f, \n, \r, \t, \v) from *str in place.
// str must point to a valid string; it is dereferenced without a null check.
void StringUtil::LTrim(std::string *str) {
  // std::isspace has undefined behavior when its argument is neither EOF nor representable as
  // unsigned char. Receiving each byte as unsigned char keeps bytes >= 0x80 (e.g. UTF-8 payload)
  // from being sign-extended into negative ints on platforms where plain char is signed.
  const auto first_kept =
      std::find_if(str->begin(), str->end(), [](unsigned char ch) { return std::isspace(ch) == 0; });
  // Everything before the first non-whitespace byte is dropped; an all-whitespace string becomes empty.
  str->erase(str->begin(), first_kept);
}

auto StringUtil::Indent(int num_indent) -> std::string { return std::string(num_indent, ' '); } // NOLINT

auto StringUtil::StartsWith(const std::string &str, const std::string &prefix) -> bool {
Expand Down Expand Up @@ -200,6 +205,16 @@ auto StringUtil::Split(const std::string &input, const std::string &split) -> st
return splits;
}

// Counts the positions in `input` at which `str` occurs.
// The search resumes one character past each hit, so overlapping occurrences are all counted.
auto StringUtil::Count(const std::string &input, const std::string &str) -> size_t {
  size_t occurrences = 0;
  for (size_t pos = input.find(str, 0); pos != std::string::npos; pos = input.find(str, pos + 1)) {
    ++occurrences;
  }
  return occurrences;
}

auto StringUtil::Strip(const std::string &str, char c) -> std::string {
// There's a copy here which is wasteful, so don't use this in performance-critical code!
std::string tmp = str;
Expand Down
35 changes: 29 additions & 6 deletions src/execution/mock_scan_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ static const char *ta_list_2023_fall[] = {"skyzh", "yliang412", "ferna
"anurag-23", "Mayank-Baranwal", "abigalekim", "ChaosZhai",
"aoleizhou", "averyqi115", "kswim8"};

// GitHub ids served as the github_id column of __mock_table_tas_2024.
// Indexed by the same row cursor as ta_oh_2024, so entry i here pairs with entry i there.
static const char *ta_list_2024[] = {"AlSchlo", "walkingcabbages", "averyqi115", "lanlou1554", "sweetsuro",
"ChaosZhai", "SDTheSlayer", "xx01cyx", "yliang412", "thelongmarch-azx"};

static const char *ta_oh_2022[] = {"Tuesday", "Wednesday", "Monday", "Wednesday", "Thursday", "Friday",
"Wednesday", "Randomly", "Tuesday", "Monday", "Tuesday"};

Expand All @@ -42,12 +45,15 @@ static const char *ta_oh_2023[] = {"Friday", "Thursday", "Tuesday", "Monday",
static const char *ta_oh_2023_fall[] = {"Randomly", "Tuesday", "Wednesday", "Tuesday", "Thursday", "Tuesday",
"Friday", "Yesterday", "Friday", "Friday", "Never"};

// Office-hour strings served as the office_hour column of __mock_table_tas_2024.
// Indexed by the same row cursor as ta_list_2024, so the two arrays must keep the same length and order.
static const char *ta_oh_2024[] = {"Friday", "Thursday", "Friday", "Wednesday", "Thursday",
"Yesterday", "Monday", "Tuesday", "Tuesday", "Monday"};

static const char *course_on_date[] = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"};

const char *mock_table_list[] = {"__mock_table_1", "__mock_table_2", "__mock_table_3", "__mock_table_tas_2022",
"__mock_table_tas_2023", "__mock_table_tas_2023_fall", "__mock_agg_input_small",
"__mock_agg_input_big", "__mock_table_schedule_2022", "__mock_table_schedule_2023",
"__mock_table_123", "__mock_graph",
"__mock_table_tas_2023", "__mock_table_tas_2023_fall", "__mock_table_tas_2024",
"__mock_agg_input_small", "__mock_agg_input_big", "__mock_table_schedule_2022",
"__mock_table_schedule", "__mock_table_123", "__mock_graph",
// For leaderboard Q1
"__mock_t1",
// For leaderboard Q2
Expand Down Expand Up @@ -84,11 +90,15 @@ auto GetMockTableSchemaOf(const std::string &table) -> Schema {
return Schema{std::vector{Column{"github_id", TypeId::VARCHAR, 128}, Column{"office_hour", TypeId::VARCHAR, 128}}};
}

if (table == "__mock_table_tas_2024") {
return Schema{std::vector{Column{"github_id", TypeId::VARCHAR, 128}, Column{"office_hour", TypeId::VARCHAR, 128}}};
}

if (table == "__mock_table_schedule_2022") {
return Schema{std::vector{Column{"day_of_week", TypeId::VARCHAR, 128}, Column{"has_lecture", TypeId::INTEGER}}};
}

if (table == "__mock_table_schedule_2023") {
if (table == "__mock_table_schedule") {
return Schema{std::vector{Column{"day_of_week", TypeId::VARCHAR, 128}, Column{"has_lecture", TypeId::INTEGER}}};
}

Expand Down Expand Up @@ -168,11 +178,15 @@ auto GetSizeOf(const MockScanPlanNode *plan) -> size_t {
return sizeof(ta_list_2023_fall) / sizeof(ta_list_2023_fall[0]);
}

if (table == "__mock_table_tas_2024") {
return sizeof(ta_list_2024) / sizeof(ta_list_2024[0]);
}

if (table == "__mock_table_schedule_2022") {
return sizeof(course_on_date) / sizeof(course_on_date[0]);
}

if (table == "__mock_table_schedule_2023") {
if (table == "__mock_table_schedule") {
return sizeof(course_on_date) / sizeof(course_on_date[0]);
}

Expand Down Expand Up @@ -306,6 +320,15 @@ auto GetFunctionOf(const MockScanPlanNode *plan) -> std::function<Tuple(size_t)>
};
}

if (table == "__mock_table_tas_2024") {
return [plan](size_t cursor) {
std::vector<Value> values{};
values.push_back(ValueFactory::GetVarcharValue(ta_list_2024[cursor]));
values.push_back(ValueFactory::GetVarcharValue(ta_oh_2024[cursor]));
return Tuple{values, &plan->OutputSchema()};
};
}

if (table == "__mock_table_schedule_2022") {
return [plan](size_t cursor) {
std::vector<Value> values{};
Expand All @@ -315,7 +338,7 @@ auto GetFunctionOf(const MockScanPlanNode *plan) -> std::function<Tuple(size_t)>
};
}

if (table == "__mock_table_schedule_2023") {
if (table == "__mock_table_schedule") {
return [plan](size_t cursor) {
std::vector<Value> values{};
values.push_back(ValueFactory::GetVarcharValue(course_on_date[cursor]));
Expand Down
9 changes: 9 additions & 0 deletions src/include/common/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,21 @@ class StringUtil {
/** @return input string split based on the split string */
static auto Split(const std::string &input, const std::string &split) -> std::vector<std::string>;

/**
 * Counts how many times a string occurs inside another string.
 * The search restarts one character after each match, so overlapping matches are each counted.
 * @param input the string to search in
 * @param str the string to look for
 * @return the number of positions in input at which str occurs
 */
static auto Count(const std::string &input, const std::string &str) -> size_t;

/**
* Removes the whitespace characters from the right side of the string.
* @param[in,out] str string to be trimmed on the right
*/
static void RTrim(std::string *str);

/**
 * Removes the whitespace characters from the left side of the string.
 * Whitespace is whatever std::isspace reports as such (' ', \f, \n, \r, \t, \v); the string is
 * modified in place.
 * @param[in,out] str string to be trimmed on the left; it is dereferenced without a null check
 */
static void LTrim(std::string *str);

/** @return indented string */
static auto Indent(int num_indent) -> std::string;

Expand Down
7 changes: 7 additions & 0 deletions src/include/optimizer/optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ class Optimizer {
auto MatchIndex(const std::string &table_name, uint32_t index_key_idx)
-> std::optional<std::tuple<index_oid_t, std::string>>;

/**
 * @brief Column pruning for the child plan that follows a projection plan.
 * @note The definition shipped in src/optimizer/column_pruning.cpp is a placeholder that returns
 *       the plan unchanged.
 * @param plan the plan to optimize
 * @return the new plan with column pruning applied
 */
auto OptimizeColumnPruning(const AbstractPlanNodeRef &plan) -> AbstractPlanNodeRef;

/**
* @brief optimize sort + limit as top N
*/
Expand Down
3 changes: 2 additions & 1 deletion src/optimizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ add_library(
optimizer_internal.cpp
order_by_index_scan.cpp
sort_limit_as_topn.cpp
seqscan_as_indexscan.cpp)
seqscan_as_indexscan.cpp
column_pruning.cpp)

set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:bustub_optimizer>
Expand Down
13 changes: 13 additions & 0 deletions src/optimizer/column_pruning.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "optimizer/optimizer.h"

namespace bustub {

/**
 * Entry point for the column pruning optimization.
 *
 * This is a placeholder: it performs no rewriting and returns the plan it was given.
 *
 * @note You may use this function to implement column pruning optimization.
 * @param plan the plan to optimize
 * @return the input plan, unchanged
 */
auto Optimizer::OptimizeColumnPruning(const bustub::AbstractPlanNodeRef &plan) -> AbstractPlanNodeRef {
// Your code here
return plan;
}

} // namespace bustub
2 changes: 1 addition & 1 deletion test/sql/p0.03-string-scan.slt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
query rowsort
select day_of_week, upper(day_of_week), lower(day_of_week), has_lecture from __mock_table_schedule_2023;
select day_of_week, upper(day_of_week), lower(day_of_week), has_lecture from __mock_table_schedule;
----
Monday MONDAY monday 1
Tuesday TUESDAY tuesday 0
Expand Down
21 changes: 10 additions & 11 deletions test/sql/p3.00-primer.slt
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
query rowsort
select github_id, office_hour from __mock_table_tas_2023_fall;
select github_id, office_hour from __mock_table_tas_2024;
----
skyzh Randomly
yliang412 Tuesday
fernandolis10 Wednesday
wiam8 Tuesday
anurag-23 Thursday
Mayank-Baranwal Tuesday
abigalekim Friday
ChaosZhai Yesterday
aoleizhou Friday
AlSchlo Friday
walkingcabbages Thursday
averyqi115 Friday
kswim8 Never
lanlou1554 Wednesday
sweetsuro Thursday
ChaosZhai Yesterday
SDTheSlayer Monday
xx01cyx Tuesday
yliang412 Tuesday
thelongmarch-azx Monday
6 changes: 3 additions & 3 deletions test/sql/p3.07-simple-agg.slt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# 4 pts

# How many TAs are there in 2023 Fall?
# How many TAs are there in 2024 Spring?
query
select count(*) from __mock_table_tas_2023_fall;
select count(*) from __mock_table_tas_2024;
----
11
10

# The real test process begins...

Expand Down
11 changes: 5 additions & 6 deletions test/sql/p3.08-group-agg-1.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@
# "rowsort" means that the order of result doesn't matter.

query rowsort
select office_hour, count(*) from __mock_table_tas_2023_fall group by office_hour;
select office_hour, count(*) from __mock_table_tas_2024 group by office_hour;
----
Never 1
Tuesday 2
Friday 2
Monday 2
Yesterday 1
Friday 3
Thursday 1
Wednesday 1
Tuesday 3
Randomly 1
Thursday 2

# The real test process begins...

Expand Down
7 changes: 5 additions & 2 deletions test/sql/p3.10-simple-join.slt
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ set force_optimizer_starter_rule=yes

query rowsort
select * from
__mock_table_tas_2023_fall inner join __mock_table_schedule_2023
__mock_table_tas_2024 inner join __mock_table_schedule
on office_hour = day_of_week
where has_lecture = 1;
----
fernandolis10 Wednesday Wednesday 1
lanlou1554 Wednesday Wednesday 1
SDTheSlayer Monday Monday 1
thelongmarch-azx Monday Monday 1


# The real test begins...

Expand Down
7 changes: 5 additions & 2 deletions test/sql/p3.14-hash-join.slt
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@

query rowsort +ensure:hash_join
select * from
__mock_table_tas_2023_fall inner join __mock_table_schedule_2023
__mock_table_tas_2024 inner join __mock_table_schedule
on office_hour = day_of_week
where has_lecture = 1;
----
fernandolis10 Wednesday Wednesday 1
SDTheSlayer Monday Monday 1
thelongmarch-azx Monday Monday 1
lanlou1554 Wednesday Wednesday 1


# The real test begins...

Expand Down
54 changes: 19 additions & 35 deletions test/sql/p3.16-sort-limit.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,33 @@

# Default
query
select * from __mock_table_tas_2023_fall order by office_hour, github_id;
select * from __mock_table_tas_2024 order by office_hour, github_id;
----
abigalekim Friday
aoleizhou Friday
AlSchlo Friday
averyqi115 Friday
kswim8 Never
skyzh Randomly
anurag-23 Thursday
Mayank-Baranwal Tuesday
wiam8 Tuesday
SDTheSlayer Monday
thelongmarch-azx Monday
sweetsuro Thursday
walkingcabbages Thursday
xx01cyx Tuesday
yliang412 Tuesday
fernandolis10 Wednesday
lanlou1554 Wednesday
ChaosZhai Yesterday


# ASC
query
select * from __mock_table_tas_2023_fall order by office_hour asc, github_id desc;
select * from __mock_table_tas_2024 order by office_hour asc, github_id desc;
----
averyqi115 Friday
aoleizhou Friday
abigalekim Friday
kswim8 Never
skyzh Randomly
anurag-23 Thursday
AlSchlo Friday
thelongmarch-azx Monday
SDTheSlayer Monday
walkingcabbages Thursday
sweetsuro Thursday
yliang412 Tuesday
wiam8 Tuesday
Mayank-Baranwal Tuesday
fernandolis10 Wednesday
ChaosZhai Yesterday


query
select * from __mock_table_tas_2023_fall order by github_id desc;
----
yliang412 Tuesday
wiam8 Tuesday
skyzh Randomly
kswim8 Never
fernandolis10 Wednesday
averyqi115 Friday
aoleizhou Friday
anurag-23 Thursday
abigalekim Friday
Mayank-Baranwal Tuesday
xx01cyx Tuesday
lanlou1554 Wednesday
ChaosZhai Yesterday


Expand Down Expand Up @@ -382,7 +364,9 @@ select * from __mock_table_123, (select * from temp_1 order by colA desc limit 3
3 98 3 1394 17139
3 97 1 2273 63790

query rowsort +ensure:nlj_init_check
# Remember to enable your hash join optimizer rule, otherwise this check will fail.
# If you haven't implemented it yet, you can remove the `+ensure:hash_join` flag from the query below.
query rowsort +ensure:hash_join
select * from
temp_2 t2 inner join
(select colB, colD, colA, colC from temp_1 order by colB desc, colD, colA desc limit 10 ) t1
Expand Down
Loading
Loading