Skip to content

Commit

Permalink
Rj/col pruning (#704)
Browse files Browse the repository at this point in the history
* col prune prep files

* add s24 tas :)
  • Loading branch information
ChaosZhai committed Mar 13, 2024
1 parent dba9b72 commit 0074882
Show file tree
Hide file tree
Showing 15 changed files with 157 additions and 68 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ set(P3_FILES
"src/optimizer/sort_limit_as_topn.cpp"
"src/optimizer/optimizer_internal.cpp"
"src/optimizer/seqscan_as_indexscan.cpp"
"src/optimizer/column_pruning.cpp"
"src/common/bustub_ddl.cpp"
"src/include/execution/plans/topn_per_group_plan.h"
${P2_FILES}
Expand Down
15 changes: 15 additions & 0 deletions src/common/util/string_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ void StringUtil::RTrim(std::string *str) {
str->erase(std::find_if(str->rbegin(), str->rend(), [](int ch) { return std::isspace(ch) == 0; }).base(), str->end());
}

/**
 * Removes the leading whitespace characters from the string, in place.
 * @param[in,out] str string to be trimmed on the left
 */
void StringUtil::LTrim(std::string *str) {
  // remove leading ' ', \f, \n, \r, \t, \v
  // The predicate takes unsigned char on purpose: plain char may be signed, and handing std::isspace a negative
  // value (other than EOF) is undefined behavior, so bytes >= 0x80 must be converted before the call.
  const auto first_kept =
      std::find_if(str->begin(), str->end(), [](unsigned char ch) { return std::isspace(ch) == 0; });
  str->erase(str->begin(), first_kept);
}

auto StringUtil::Indent(int num_indent) -> std::string { return std::string(num_indent, ' '); } // NOLINT

auto StringUtil::StartsWith(const std::string &str, const std::string &prefix) -> bool {
Expand Down Expand Up @@ -200,6 +205,16 @@ auto StringUtil::Split(const std::string &input, const std::string &split) -> st
return splits;
}

/**
 * Counts the occurrences of str inside input.
 * Each new search starts one character after the beginning of the previous match, so overlapping
 * occurrences are counted separately.
 * @param input the string to search in
 * @param str the string to search for
 * @return the number of matches found
 */
auto StringUtil::Count(const std::string &input, const std::string &str) -> size_t {
  size_t occurrences = 0;
  for (size_t match_pos = input.find(str); match_pos != std::string::npos;
       match_pos = input.find(str, match_pos + 1)) {
    ++occurrences;
  }
  return occurrences;
}

auto StringUtil::Strip(const std::string &str, char c) -> std::string {
// There's a copy here which is wasteful, so don't use this in performance-critical code!
std::string tmp = str;
Expand Down
35 changes: 29 additions & 6 deletions src/execution/mock_scan_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ static const char *ta_list_2023_fall[] = {"skyzh", "yliang412", "ferna
"anurag-23", "Mayank-Baranwal", "abigalekim", "ChaosZhai",
"aoleizhou", "averyqi115", "kswim8"};

// github_id values served for __mock_table_tas_2024. The number of entries here is the table's row count, and
// entry i is paired with ta_oh_2024[i] when a row is produced.
static const char *ta_list_2024[] = {"AlSchlo", "walkingcabbages", "averyqi115", "lanlou1554", "sweetsuro",
"ChaosZhai", "SDTheSlayer", "xx01cyx", "yliang412", "thelongmarch-azx"};

// Office-hour strings, one per row.
// NOTE(review): presumably parallel to the 2022 TA list and served by __mock_table_tas_2022 - confirm.
static const char *ta_oh_2022[] = {"Tuesday", "Wednesday", "Monday", "Wednesday", "Thursday", "Friday",
"Wednesday", "Randomly", "Tuesday", "Monday", "Tuesday"};

Expand All @@ -42,12 +45,15 @@ static const char *ta_oh_2023[] = {"Friday", "Thursday", "Tuesday", "Monday",
// Office-hour strings, one per row.
// NOTE(review): presumably paired index-by-index with ta_list_2023_fall for __mock_table_tas_2023_fall - confirm.
static const char *ta_oh_2023_fall[] = {"Randomly", "Tuesday", "Wednesday", "Tuesday", "Thursday", "Tuesday",
"Friday", "Yesterday", "Friday", "Friday", "Never"};

// office_hour values served for __mock_table_tas_2024. Indexed with the same cursor as ta_list_2024 (whose size
// determines the row count), so this array must keep the same number of entries.
static const char *ta_oh_2024[] = {"Friday", "Thursday", "Friday", "Wednesday", "Thursday",
"Yesterday", "Monday", "Tuesday", "Tuesday", "Monday"};

// Days of the week used for the mock schedule tables (__mock_table_schedule_2022 and __mock_table_schedule);
// its length is the row count of those tables.
static const char *course_on_date[] = {"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"};

const char *mock_table_list[] = {"__mock_table_1", "__mock_table_2", "__mock_table_3", "__mock_table_tas_2022",
"__mock_table_tas_2023", "__mock_table_tas_2023_fall", "__mock_agg_input_small",
"__mock_agg_input_big", "__mock_table_schedule_2022", "__mock_table_schedule_2023",
"__mock_table_123", "__mock_graph",
"__mock_table_tas_2023", "__mock_table_tas_2023_fall", "__mock_table_tas_2024",
"__mock_agg_input_small", "__mock_agg_input_big", "__mock_table_schedule_2022",
"__mock_table_schedule", "__mock_table_123", "__mock_graph",
// For leaderboard Q1
"__mock_t1",
// For leaderboard Q2
Expand Down Expand Up @@ -84,11 +90,15 @@ auto GetMockTableSchemaOf(const std::string &table) -> Schema {
return Schema{std::vector{Column{"github_id", TypeId::VARCHAR, 128}, Column{"office_hour", TypeId::VARCHAR, 128}}};
}

if (table == "__mock_table_tas_2024") {
return Schema{std::vector{Column{"github_id", TypeId::VARCHAR, 128}, Column{"office_hour", TypeId::VARCHAR, 128}}};
}

if (table == "__mock_table_schedule_2022") {
return Schema{std::vector{Column{"day_of_week", TypeId::VARCHAR, 128}, Column{"has_lecture", TypeId::INTEGER}}};
}

if (table == "__mock_table_schedule_2023") {
if (table == "__mock_table_schedule") {
return Schema{std::vector{Column{"day_of_week", TypeId::VARCHAR, 128}, Column{"has_lecture", TypeId::INTEGER}}};
}

Expand Down Expand Up @@ -168,11 +178,15 @@ auto GetSizeOf(const MockScanPlanNode *plan) -> size_t {
return sizeof(ta_list_2023_fall) / sizeof(ta_list_2023_fall[0]);
}

if (table == "__mock_table_tas_2024") {
return sizeof(ta_list_2024) / sizeof(ta_list_2024[0]);
}

if (table == "__mock_table_schedule_2022") {
return sizeof(course_on_date) / sizeof(course_on_date[0]);
}

if (table == "__mock_table_schedule_2023") {
if (table == "__mock_table_schedule") {
return sizeof(course_on_date) / sizeof(course_on_date[0]);
}

Expand Down Expand Up @@ -306,6 +320,15 @@ auto GetFunctionOf(const MockScanPlanNode *plan) -> std::function<Tuple(size_t)>
};
}

if (table == "__mock_table_tas_2024") {
return [plan](size_t cursor) {
std::vector<Value> values{};
values.push_back(ValueFactory::GetVarcharValue(ta_list_2024[cursor]));
values.push_back(ValueFactory::GetVarcharValue(ta_oh_2024[cursor]));
return Tuple{values, &plan->OutputSchema()};
};
}

if (table == "__mock_table_schedule_2022") {
return [plan](size_t cursor) {
std::vector<Value> values{};
Expand All @@ -315,7 +338,7 @@ auto GetFunctionOf(const MockScanPlanNode *plan) -> std::function<Tuple(size_t)>
};
}

if (table == "__mock_table_schedule_2023") {
if (table == "__mock_table_schedule") {
return [plan](size_t cursor) {
std::vector<Value> values{};
values.push_back(ValueFactory::GetVarcharValue(course_on_date[cursor]));
Expand Down
9 changes: 9 additions & 0 deletions src/include/common/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,21 @@ class StringUtil {
/** @return input string split based on the split string */
static auto Split(const std::string &input, const std::string &split) -> std::vector<std::string>;

/**
 * Counts how many times a string occurs in the input string. The search resumes one character after the start
 * of each match, so overlapping occurrences are counted individually.
 * @param input the string to search in
 * @param str the string to search for
 * @return the number of occurrences of str in input
 */
static auto Count(const std::string &input, const std::string &str) -> size_t;

/**
* Removes the whitespace characters from the right side of the string.
* @param[in,out] str string to be trimmed on the right
*/
static void RTrim(std::string *str);

/**
 * Removes the whitespace characters (' ', \f, \n, \r, \t, \v) from the left side of the string.
 * The string is modified in place.
 * @param[in,out] str string to be trimmed on the left
 */
static void LTrim(std::string *str);

/** @return indented string */
static auto Indent(int num_indent) -> std::string;

Expand Down
7 changes: 7 additions & 0 deletions src/include/optimizer/optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ class Optimizer {
auto MatchIndex(const std::string &table_name, uint32_t index_key_idx)
-> std::optional<std::tuple<index_oid_t, std::string>>;

/**
 * @brief Column pruning for the child plan that follows a projection plan.
 * @param plan the plan to optimize
 * @return the new plan with column pruning applied
 */
auto OptimizeColumnPruning(const AbstractPlanNodeRef &plan) -> AbstractPlanNodeRef;

/**
* @brief optimize sort + limit as top N
*/
Expand Down
3 changes: 2 additions & 1 deletion src/optimizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ add_library(
optimizer_internal.cpp
order_by_index_scan.cpp
sort_limit_as_topn.cpp
seqscan_as_indexscan.cpp)
seqscan_as_indexscan.cpp
column_pruning.cpp)

set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:bustub_optimizer>
Expand Down
13 changes: 13 additions & 0 deletions src/optimizer/column_pruning.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "optimizer/optimizer.h"

namespace bustub {

/**
 * Column-pruning optimizer rule. This is a placeholder: as written, it returns the input plan unchanged.
 * @note You may use this function to implement column pruning optimization.
 * @param plan the plan to optimize
 * @return the resulting plan (currently the same plan that was passed in)
 */
auto Optimizer::OptimizeColumnPruning(const bustub::AbstractPlanNodeRef &plan) -> AbstractPlanNodeRef {
// Your code here
return plan;
}

} // namespace bustub
2 changes: 1 addition & 1 deletion test/sql/p0.03-string-scan.slt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
query rowsort
select day_of_week, upper(day_of_week), lower(day_of_week), has_lecture from __mock_table_schedule_2023;
select day_of_week, upper(day_of_week), lower(day_of_week), has_lecture from __mock_table_schedule;
----
Monday MONDAY monday 1
Tuesday TUESDAY tuesday 0
Expand Down
21 changes: 10 additions & 11 deletions test/sql/p3.00-primer.slt
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
query rowsort
select github_id, office_hour from __mock_table_tas_2023_fall;
select github_id, office_hour from __mock_table_tas_2024;
----
skyzh Randomly
yliang412 Tuesday
fernandolis10 Wednesday
wiam8 Tuesday
anurag-23 Thursday
Mayank-Baranwal Tuesday
abigalekim Friday
ChaosZhai Yesterday
aoleizhou Friday
AlSchlo Friday
walkingcabbages Thursday
averyqi115 Friday
kswim8 Never
lanlou1554 Wednesday
sweetsuro Thursday
ChaosZhai Yesterday
SDTheSlayer Monday
xx01cyx Tuesday
yliang412 Tuesday
thelongmarch-azx Monday
6 changes: 3 additions & 3 deletions test/sql/p3.07-simple-agg.slt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# 4 pts

# How many TAs are there in 2023 Fall?
# How many TAs are there in 2024 Spring?
query
select count(*) from __mock_table_tas_2023_fall;
select count(*) from __mock_table_tas_2024;
----
11
10

# The real test process begins...

Expand Down
11 changes: 5 additions & 6 deletions test/sql/p3.08-group-agg-1.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@
# "rowsort" means that the order of result doesn't matter.

query rowsort
select office_hour, count(*) from __mock_table_tas_2023_fall group by office_hour;
select office_hour, count(*) from __mock_table_tas_2024 group by office_hour;
----
Never 1
Tuesday 2
Friday 2
Monday 2
Yesterday 1
Friday 3
Thursday 1
Wednesday 1
Tuesday 3
Randomly 1
Thursday 2

# The real test process begins...

Expand Down
7 changes: 5 additions & 2 deletions test/sql/p3.10-simple-join.slt
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ set force_optimizer_starter_rule=yes

query rowsort
select * from
__mock_table_tas_2023_fall inner join __mock_table_schedule_2023
__mock_table_tas_2024 inner join __mock_table_schedule
on office_hour = day_of_week
where has_lecture = 1;
----
fernandolis10 Wednesday Wednesday 1
lanlou1554 Wednesday Wednesday 1
SDTheSlayer Monday Monday 1
thelongmarch-azx Monday Monday 1


# The real test begins...

Expand Down
7 changes: 5 additions & 2 deletions test/sql/p3.14-hash-join.slt
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@

query rowsort +ensure:hash_join
select * from
__mock_table_tas_2023_fall inner join __mock_table_schedule_2023
__mock_table_tas_2024 inner join __mock_table_schedule
on office_hour = day_of_week
where has_lecture = 1;
----
fernandolis10 Wednesday Wednesday 1
SDTheSlayer Monday Monday 1
thelongmarch-azx Monday Monday 1
lanlou1554 Wednesday Wednesday 1


# The real test begins...

Expand Down
54 changes: 19 additions & 35 deletions test/sql/p3.16-sort-limit.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,33 @@

# Default
query
select * from __mock_table_tas_2023_fall order by office_hour, github_id;
select * from __mock_table_tas_2024 order by office_hour, github_id;
----
abigalekim Friday
aoleizhou Friday
AlSchlo Friday
averyqi115 Friday
kswim8 Never
skyzh Randomly
anurag-23 Thursday
Mayank-Baranwal Tuesday
wiam8 Tuesday
SDTheSlayer Monday
thelongmarch-azx Monday
sweetsuro Thursday
walkingcabbages Thursday
xx01cyx Tuesday
yliang412 Tuesday
fernandolis10 Wednesday
lanlou1554 Wednesday
ChaosZhai Yesterday


# ASC
query
select * from __mock_table_tas_2023_fall order by office_hour asc, github_id desc;
select * from __mock_table_tas_2024 order by office_hour asc, github_id desc;
----
averyqi115 Friday
aoleizhou Friday
abigalekim Friday
kswim8 Never
skyzh Randomly
anurag-23 Thursday
AlSchlo Friday
thelongmarch-azx Monday
SDTheSlayer Monday
walkingcabbages Thursday
sweetsuro Thursday
yliang412 Tuesday
wiam8 Tuesday
Mayank-Baranwal Tuesday
fernandolis10 Wednesday
ChaosZhai Yesterday


query
select * from __mock_table_tas_2023_fall order by github_id desc;
----
yliang412 Tuesday
wiam8 Tuesday
skyzh Randomly
kswim8 Never
fernandolis10 Wednesday
averyqi115 Friday
aoleizhou Friday
anurag-23 Thursday
abigalekim Friday
Mayank-Baranwal Tuesday
xx01cyx Tuesday
lanlou1554 Wednesday
ChaosZhai Yesterday


Expand Down Expand Up @@ -382,7 +364,9 @@ select * from __mock_table_123, (select * from temp_1 order by colA desc limit 3
3 98 3 1394 17139
3 97 1 2273 63790

query rowsort +ensure:nlj_init_check
# Remember to enable your hash join optimizer to pass this test.
# You can remove the +ensure:hash_join check if you haven't implemented it yet.
query rowsort +ensure:hash_join
select * from
temp_2 t2 inner join
(select colB, colD, colA, colC from temp_1 order by colB desc, colD, colA desc limit 10 ) t1
Expand Down
Loading

0 comments on commit 0074882

Please sign in to comment.