From 007488235efc4686f3dad87a45a39eb66fb0526e Mon Sep 17 00:00:00 2001 From: Chaos Zhai Date: Wed, 13 Mar 2024 13:16:32 -0400 Subject: [PATCH] Rj/col pruning (#704) * col prune prep files * add s24 tas :) --- CMakeLists.txt | 1 + src/common/util/string_util.cpp | 15 ++++++++ src/execution/mock_scan_executor.cpp | 35 ++++++++++++++--- src/include/common/util/string_util.h | 9 +++++ src/include/optimizer/optimizer.h | 7 ++++ src/optimizer/CMakeLists.txt | 3 +- src/optimizer/column_pruning.cpp | 13 +++++++ test/sql/p0.03-string-scan.slt | 2 +- test/sql/p3.00-primer.slt | 21 +++++------ test/sql/p3.07-simple-agg.slt | 6 +-- test/sql/p3.08-group-agg-1.slt | 11 +++--- test/sql/p3.10-simple-join.slt | 7 +++- test/sql/p3.14-hash-join.slt | 7 +++- test/sql/p3.16-sort-limit.slt | 54 ++++++++++----------------- tools/sqllogictest/sqllogictest.cpp | 34 ++++++++++++++++- 15 files changed, 157 insertions(+), 68 deletions(-) create mode 100644 src/optimizer/column_pruning.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2c63342..901bb388c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -351,6 +351,7 @@ set(P3_FILES "src/optimizer/sort_limit_as_topn.cpp" "src/optimizer/optimizer_internal.cpp" "src/optimizer/seqscan_as_indexscan.cpp" + "src/optimizer/column_pruning.cpp" "src/common/bustub_ddl.cpp" "src/include/execution/plans/topn_per_group_plan.h" ${P2_FILES} diff --git a/src/common/util/string_util.cpp b/src/common/util/string_util.cpp index 26ee48c3b..65d1b0f98 100644 --- a/src/common/util/string_util.cpp +++ b/src/common/util/string_util.cpp @@ -42,6 +42,11 @@ void StringUtil::RTrim(std::string *str) { str->erase(std::find_if(str->rbegin(), str->rend(), [](int ch) { return std::isspace(ch) == 0; }).base(), str->end()); } +void StringUtil::LTrim(std::string *str) { + // remove leading ' ', \f, \n, \r, \t, \v + str->erase(str->begin(), std::find_if(str->begin(), str->end(), [](int ch) { return std::isspace(ch) == 0; })); +} + auto StringUtil::Indent(int num_indent) -> std::string { return std::string(num_indent, ' '); } // NOLINT auto StringUtil::StartsWith(const std::string &str, const std::string &prefix) -> bool { @@ -200,6 +205,16 @@ auto StringUtil::Split(const std::string &input, const std::string &split) -> st return splits; } +auto StringUtil::Count(const std::string &input, const std::string &str) -> size_t { + size_t count = 0; + size_t n_pos = input.find(str, 0); // first occurrence + while (n_pos != std::string::npos) { + count++; + n_pos = input.find(str, n_pos + 1); + } + return count; +} + auto StringUtil::Strip(const std::string &str, char c) -> std::string { // There's a copy here which is wasteful, so don't use this in performance-critical code! std::string tmp = str; diff --git a/src/execution/mock_scan_executor.cpp b/src/execution/mock_scan_executor.cpp index 0527b254d..7c3ea3beb 100644 --- a/src/execution/mock_scan_executor.cpp +++ b/src/execution/mock_scan_executor.cpp @@ -33,6 +33,9 @@ static const char *ta_list_2023_fall[] = {"skyzh", "yliang412", "ferna "anurag-23", "Mayank-Baranwal", "abigalekim", "ChaosZhai", "aoleizhou", "averyqi115", "kswim8"}; +static const char *ta_list_2024[] = {"AlSchlo", "walkingcabbages", "averyqi115", "lanlou1554", "sweetsuro", + "ChaosZhai", "SDTheSlayer", "xx01cyx", "yliang412", "thelongmarch-azx"}; + static const char *ta_oh_2022[] = {"Tuesday", "Wednesday", "Monday", "Wednesday", "Thursday", "Friday", "Wednesday", "Randomly", "Tuesday", "Monday", "Tuesday"}; @@ -42,12 +45,15 @@ static const char *ta_oh_2023[] = {"Friday", "Thursday", "Tuesday", "Monday", static const char *ta_oh_2023_fall[] = {"Randomly", "Tuesday", "Wednesday", "Tuesday", "Thursday", "Tuesday", "Friday", "Yesterday", "Friday", "Friday", "Never"}; +static const char *ta_oh_2024[] = {"Friday", "Thursday", "Friday", "Wednesday", "Thursday", + "Yesterday", "Monday", "Tuesday", "Tuesday", "Monday"}; + static const char *course_on_date[] = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"}; const char *mock_table_list[] = {"__mock_table_1", "__mock_table_2", "__mock_table_3", "__mock_table_tas_2022", - "__mock_table_tas_2023", "__mock_table_tas_2023_fall", "__mock_agg_input_small", - "__mock_agg_input_big", "__mock_table_schedule_2022", "__mock_table_schedule_2023", - "__mock_table_123", "__mock_graph", + "__mock_table_tas_2023", "__mock_table_tas_2023_fall", "__mock_table_tas_2024", + "__mock_agg_input_small", "__mock_agg_input_big", "__mock_table_schedule_2022", + "__mock_table_schedule", "__mock_table_123", "__mock_graph", // For leaderboard Q1 "__mock_t1", // For leaderboard Q2 @@ -84,11 +90,15 @@ auto GetMockTableSchemaOf(const std::string &table) -> Schema { return Schema{std::vector{Column{"github_id", TypeId::VARCHAR, 128}, Column{"office_hour", TypeId::VARCHAR, 128}}}; } + if (table == "__mock_table_tas_2024") { + return Schema{std::vector{Column{"github_id", TypeId::VARCHAR, 128}, Column{"office_hour", TypeId::VARCHAR, 128}}}; + } + if (table == "__mock_table_schedule_2022") { return Schema{std::vector{Column{"day_of_week", TypeId::VARCHAR, 128}, Column{"has_lecture", TypeId::INTEGER}}}; } - if (table == "__mock_table_schedule_2023") { + if (table == "__mock_table_schedule") { return Schema{std::vector{Column{"day_of_week", TypeId::VARCHAR, 128}, Column{"has_lecture", TypeId::INTEGER}}}; } @@ -168,11 +178,15 @@ auto GetSizeOf(const MockScanPlanNode *plan) -> size_t { return sizeof(ta_list_2023_fall) / sizeof(ta_list_2023_fall[0]); } + if (table == "__mock_table_tas_2024") { + return sizeof(ta_list_2024) / sizeof(ta_list_2024[0]); + } + if (table == "__mock_table_schedule_2022") { return sizeof(course_on_date) / sizeof(course_on_date[0]); } - if (table == "__mock_table_schedule_2023") { + if (table == "__mock_table_schedule") { return sizeof(course_on_date) / sizeof(course_on_date[0]); } @@ -306,6 +320,15 @@ auto GetFunctionOf(const MockScanPlanNode *plan) -> std::function }; } + if (table == "__mock_table_tas_2024") { + return [plan](size_t cursor) { + std::vector values{}; + values.push_back(ValueFactory::GetVarcharValue(ta_list_2024[cursor])); + values.push_back(ValueFactory::GetVarcharValue(ta_oh_2024[cursor])); + return Tuple{values, &plan->OutputSchema()}; + }; + } + if (table == "__mock_table_schedule_2022") { return [plan](size_t cursor) { std::vector values{}; @@ -315,7 +338,7 @@ auto GetFunctionOf(const MockScanPlanNode *plan) -> std::function }; } - if (table == "__mock_table_schedule_2023") { + if (table == "__mock_table_schedule") { return [plan](size_t cursor) { std::vector values{}; values.push_back(ValueFactory::GetVarcharValue(course_on_date[cursor])); diff --git a/src/include/common/util/string_util.h b/src/include/common/util/string_util.h index 37beffc90..7cb0ab5d9 100644 --- a/src/include/common/util/string_util.h +++ b/src/include/common/util/string_util.h @@ -85,12 +85,21 @@ class StringUtil { /** @return input string split based on the split string */ static auto Split(const std::string &input, const std::string &split) -> std::vector; + /** @return count occurrence of specified string in input string */ + static auto Count(const std::string &input, const std::string &str) -> size_t; + /** * Removes the whitespace characters from the right side of the string. * @param[in,out] str string to be trimmed on the right */ static void RTrim(std::string *str); + /** + * Removes the whitespace characters from the left side of the string. + * @param[in,out] str string to be trimmed on the left + */ + static void LTrim(std::string *str); + /** @return indented string */ static auto Indent(int num_indent) -> std::string; diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index d625f89c4..2f722a803 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -93,6 +93,13 @@ class Optimizer { auto MatchIndex(const std::string &table_name, uint32_t index_key_idx) -> std::optional>; + /** + * @brief column pruning for child plan following a projection plan + * @param plan the plan to optimize + * @return the new plan with column pruning + */ + auto OptimizeColumnPruning(const AbstractPlanNodeRef &plan) -> AbstractPlanNodeRef; + /** * @brief optimize sort + limit as top N */ diff --git a/src/optimizer/CMakeLists.txt b/src/optimizer/CMakeLists.txt index 4c86c7dff..07ff7af9f 100644 --- a/src/optimizer/CMakeLists.txt +++ b/src/optimizer/CMakeLists.txt @@ -12,7 +12,8 @@ add_library( optimizer_internal.cpp order_by_index_scan.cpp sort_limit_as_topn.cpp - seqscan_as_indexscan.cpp) + seqscan_as_indexscan.cpp + column_pruning.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ diff --git a/src/optimizer/column_pruning.cpp b/src/optimizer/column_pruning.cpp new file mode 100644 index 000000000..817718be7 --- /dev/null +++ b/src/optimizer/column_pruning.cpp @@ -0,0 +1,13 @@ +#include "optimizer/optimizer.h" + +namespace bustub { + +/** + * @note You may use this function to implement column pruning optimization. + */ +auto Optimizer::OptimizeColumnPruning(const bustub::AbstractPlanNodeRef &plan) -> AbstractPlanNodeRef { + // Your code here + return plan; +} + +} // namespace bustub diff --git a/test/sql/p0.03-string-scan.slt b/test/sql/p0.03-string-scan.slt index 4d017690f..890a010a4 100644 --- a/test/sql/p0.03-string-scan.slt +++ b/test/sql/p0.03-string-scan.slt @@ -1,5 +1,5 @@ query rowsort -select day_of_week, upper(day_of_week), lower(day_of_week), has_lecture from __mock_table_schedule_2023; +select day_of_week, upper(day_of_week), lower(day_of_week), has_lecture from __mock_table_schedule; ---- Monday MONDAY monday 1 Tuesday TUESDAY tuesday 0 diff --git a/test/sql/p3.00-primer.slt b/test/sql/p3.00-primer.slt index 6c4bd034c..a46e6f842 100644 --- a/test/sql/p3.00-primer.slt +++ b/test/sql/p3.00-primer.slt @@ -1,14 +1,13 @@ query rowsort -select github_id, office_hour from __mock_table_tas_2023_fall; +select github_id, office_hour from __mock_table_tas_2024; ---- -skyzh Randomly -yliang412 Tuesday -fernandolis10 Wednesday -wiam8 Tuesday -anurag-23 Thursday -Mayank-Baranwal Tuesday -abigalekim Friday -ChaosZhai Yesterday -aoleizhou Friday +AlSchlo Friday +walkingcabbages Thursday averyqi115 Friday -kswim8 Never +lanlou1554 Wednesday +sweetsuro Thursday +ChaosZhai Yesterday +SDTheSlayer Monday +xx01cyx Tuesday +yliang412 Tuesday +thelongmarch-azx Monday diff --git a/test/sql/p3.07-simple-agg.slt b/test/sql/p3.07-simple-agg.slt index c10252f33..2e3d21f27 100644 --- a/test/sql/p3.07-simple-agg.slt +++ b/test/sql/p3.07-simple-agg.slt @@ -1,10 +1,10 @@ # 4 pts -# How many TAs are there in 2023 Fall? +# How many TAs are there in 2024 Spring? query -select count(*) from __mock_table_tas_2023_fall; +select count(*) from __mock_table_tas_2024; ---- -11 +10 # The real test process begins... diff --git a/test/sql/p3.08-group-agg-1.slt b/test/sql/p3.08-group-agg-1.slt index d13c66934..15499e5cf 100644 --- a/test/sql/p3.08-group-agg-1.slt +++ b/test/sql/p3.08-group-agg-1.slt @@ -4,15 +4,14 @@ # "rowsort" means that the order of result doesn't matter. query rowsort -select office_hour, count(*) from __mock_table_tas_2023_fall group by office_hour; +select office_hour, count(*) from __mock_table_tas_2024 group by office_hour; ---- -Never 1 +Tuesday 2 +Friday 2 +Monday 2 Yesterday 1 -Friday 3 -Thursday 1 Wednesday 1 -Tuesday 3 -Randomly 1 +Thursday 2 # The real test process begins... diff --git a/test/sql/p3.10-simple-join.slt b/test/sql/p3.10-simple-join.slt index cb16009dd..0e9427825 100644 --- a/test/sql/p3.10-simple-join.slt +++ b/test/sql/p3.10-simple-join.slt @@ -8,11 +8,14 @@ set force_optimizer_starter_rule=yes query rowsort select * from - __mock_table_tas_2023_fall inner join __mock_table_schedule_2023 + __mock_table_tas_2024 inner join __mock_table_schedule on office_hour = day_of_week where has_lecture = 1; ---- -fernandolis10 Wednesday Wednesday 1 +lanlou1554 Wednesday Wednesday 1 +SDTheSlayer Monday Monday 1 +thelongmarch-azx Monday Monday 1 + # The real test begins... diff --git a/test/sql/p3.14-hash-join.slt b/test/sql/p3.14-hash-join.slt index c5aa5d3af..b3acaaab8 100644 --- a/test/sql/p3.14-hash-join.slt +++ b/test/sql/p3.14-hash-join.slt @@ -5,11 +5,14 @@ query rowsort +ensure:hash_join select * from - __mock_table_tas_2023_fall inner join __mock_table_schedule_2023 + __mock_table_tas_2024 inner join __mock_table_schedule on office_hour = day_of_week where has_lecture = 1; ---- -fernandolis10 Wednesday Wednesday 1 +SDTheSlayer Monday Monday 1 +thelongmarch-azx Monday Monday 1 +lanlou1554 Wednesday Wednesday 1 + # The real test begins... diff --git a/test/sql/p3.16-sort-limit.slt b/test/sql/p3.16-sort-limit.slt index 338548838..951853cc5 100644 --- a/test/sql/p3.16-sort-limit.slt +++ b/test/sql/p3.16-sort-limit.slt @@ -4,51 +4,33 @@ # Default query -select * from __mock_table_tas_2023_fall order by office_hour, github_id; +select * from __mock_table_tas_2024 order by office_hour, github_id; ---- -abigalekim Friday -aoleizhou Friday +AlSchlo Friday averyqi115 Friday -kswim8 Never -skyzh Randomly -anurag-23 Thursday -Mayank-Baranwal Tuesday -wiam8 Tuesday +SDTheSlayer Monday +thelongmarch-azx Monday +sweetsuro Thursday +walkingcabbages Thursday +xx01cyx Tuesday yliang412 Tuesday -fernandolis10 Wednesday +lanlou1554 Wednesday ChaosZhai Yesterday # ASC query -select * from __mock_table_tas_2023_fall order by office_hour asc, github_id desc; +select * from __mock_table_tas_2024 order by office_hour asc, github_id desc; ---- averyqi115 Friday -aoleizhou Friday -abigalekim Friday -kswim8 Never -skyzh Randomly -anurag-23 Thursday +AlSchlo Friday +thelongmarch-azx Monday +SDTheSlayer Monday +walkingcabbages Thursday +sweetsuro Thursday yliang412 Tuesday -wiam8 Tuesday -Mayank-Baranwal Tuesday -fernandolis10 Wednesday -ChaosZhai Yesterday - - -query -select * from __mock_table_tas_2023_fall order by github_id desc; ----- -yliang412 Tuesday -wiam8 Tuesday -skyzh Randomly -kswim8 Never -fernandolis10 Wednesday -averyqi115 Friday -aoleizhou Friday -anurag-23 Thursday -abigalekim Friday -Mayank-Baranwal Tuesday +xx01cyx Tuesday +lanlou1554 Wednesday ChaosZhai Yesterday @@ -382,7 +364,9 @@ select * from __mock_table_123, (select * from temp_1 order by colA desc limit 3 3 98 3 1394 17139 3 97 1 2273 63790 -query rowsort +ensure:nlj_init_check +# remember to enable your hash join optimizer to pass this +# you could disable this ensure if you haven't implemented it yet +query rowsort +ensure:hash_join select * from temp_2 t2 inner join (select colB, colD, colA, colC from temp_1 order by colB desc, colD, colA desc limit 10 ) t1 diff --git a/tools/sqllogictest/sqllogictest.cpp b/tools/sqllogictest/sqllogictest.cpp index 8a7338e19..f78920849 100644 --- a/tools/sqllogictest/sqllogictest.cpp +++ b/tools/sqllogictest/sqllogictest.cpp @@ -69,7 +69,7 @@ auto ProcessExtraOptions(const std::string &sql, bustub::BustubInstance &instanc if (bustub::StringUtil::StartsWith(opt, "ensure:")) { std::stringstream result; auto writer = bustub::SimpleStreamWriter(result); - instance.ExecuteSql("explain " + sql, writer); + instance.ExecuteSql("explain (o) " + sql, writer); if (opt == "ensure:index_scan") { if (!bustub::StringUtil::Contains(result.str(), "IndexScan")) { @@ -129,6 +129,38 @@ auto ProcessExtraOptions(const std::string &sql, bustub::BustubInstance &instanc return false; } check_options->check_options_set_.emplace(bustub::CheckOption::ENABLE_NLJ_CHECK); + } else if (bustub::StringUtil::StartsWith(opt, "ensure:column-pruned")) { + auto args = bustub::StringUtil::Split(opt, ":"); + if (args.size() != 4) { + throw bustub::NotImplementedException(fmt::format("unsupported extra option: {}", opt)); + } + auto expected_cols_proj = std::stoi(args[2]); + auto expected_cols_agg = std::stoi(args[3]); + // find agg & proj plan and test if the output schema has the expected number of columns + auto lines = bustub::StringUtil::Split(result.str(), "\n"); + for (auto &line : lines) { + bustub::StringUtil::LTrim(&line); + if (bustub::StringUtil::StartsWith(line, "Agg")) { + auto cols = bustub::StringUtil::Split(line, "],"); + if (cols.size() != 3) { + fmt::print("Agg plan wrong formatting!\n"); + return false; + } + for (int i = 0; i < 2; i++) { + if (bustub::StringUtil::Count(cols[i], "\",")+1 > static_cast(expected_cols_agg)) { + fmt::print("Agg wrong column pruning count!\n"); + return false; + } + } + break; + } + if (bustub::StringUtil::StartsWith(line, "Projection")) { + if (bustub::StringUtil::Count(line, "\",")+1 > static_cast(expected_cols_proj)) { + fmt::print("Projection wrong column pruning count!\n"); + return false; + } + } + } } else { throw bustub::NotImplementedException(fmt::format("unsupported extra option: {}", opt)); }