-
Notifications
You must be signed in to change notification settings - Fork 2.6k
No mark to semi join if mark index is in projection #12828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a03011f
adbc0de
466f64d
5b28860
6efbc88
a9f3586
6411f88
4b1be69
cbc03cc
d3fea9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,89 @@ | ||
#include "duckdb/optimizer/filter_pushdown.hpp" | ||
|
||
#include "duckdb/optimizer/filter_combiner.hpp" | ||
#include "duckdb/optimizer/optimizer.hpp" | ||
#include "duckdb/planner/expression_iterator.hpp" | ||
#include "duckdb/planner/operator/logical_filter.hpp" | ||
#include "duckdb/planner/operator/logical_join.hpp" | ||
#include "duckdb/planner/operator/logical_window.hpp" | ||
#include "duckdb/optimizer/optimizer.hpp" | ||
#include "duckdb/planner/operator/logical_projection.hpp" | ||
#include "duckdb/planner/operator/logical_comparison_join.hpp" | ||
|
||
namespace duckdb { | ||
|
||
using Filter = FilterPushdown::Filter; | ||
|
||
FilterPushdown::FilterPushdown(Optimizer &optimizer, bool convert_mark_joins) | ||
static unordered_set<idx_t> GetMarkJoinIndexes(LogicalOperator &plan, unordered_set<idx_t> &table_bindings) { | ||
unordered_set<idx_t> projected_mark_join_indexes; | ||
switch (plan.type) { | ||
case LogicalOperatorType::LOGICAL_COMPARISON_JOIN: { | ||
auto &join = plan.Cast<LogicalComparisonJoin>(); | ||
if (join.join_type != JoinType::MARK) { | ||
break; | ||
} | ||
// if the projected table bindings include the mark join index, | ||
if (table_bindings.find(join.mark_index) != table_bindings.end()) { | ||
projected_mark_join_indexes.insert(join.mark_index); | ||
} | ||
break; | ||
} | ||
// you need to store table.column index. | ||
// if you get to a projection, you need to change the table_bindings passed so they reflect the | ||
// table index of the original expression they originated from. | ||
case LogicalOperatorType::LOGICAL_PROJECTION: { | ||
// when we encounter a projection, replace the table_bindings with | ||
// the tables in the projection | ||
auto plan_bindings = plan.GetColumnBindings(); | ||
auto &proj = plan.Cast<LogicalProjection>(); | ||
auto proj_bindings = proj.GetColumnBindings(); | ||
unordered_set<idx_t> new_table_bindings; | ||
for (auto &binding : proj_bindings) { | ||
auto col_index = binding.column_index; | ||
auto &expr = proj.expressions.at(col_index); | ||
vector<ColumnBinding> bindings_to_keep; | ||
ExpressionIterator::EnumerateExpression(expr, [&](Expression &child) { | ||
if (expr->expression_class == ExpressionClass::BOUND_COLUMN_REF) { | ||
auto &col_ref = expr->Cast<BoundColumnRefExpression>(); | ||
bindings_to_keep.push_back(col_ref.binding); | ||
} | ||
}); | ||
for (auto &expr_binding : bindings_to_keep) { | ||
new_table_bindings.insert(expr_binding.table_index); | ||
} | ||
table_bindings = new_table_bindings; | ||
} | ||
break; | ||
} | ||
default: | ||
break; | ||
} | ||
|
||
// recurse into the children to find mark joins and project their indexes. | ||
for (auto &child : plan.children) { | ||
auto extra_mark_indexes = GetMarkJoinIndexes(*child, table_bindings); | ||
for (auto extra_index : extra_mark_indexes) { | ||
projected_mark_join_indexes.insert(extra_index); | ||
} | ||
} | ||
return projected_mark_join_indexes; | ||
} | ||
|
||
FilterPushdown::FilterPushdown(Optimizer &optimizer, LogicalOperator &plan, bool convert_mark_joins) | ||
: optimizer(optimizer), combiner(optimizer.context), convert_mark_joins(convert_mark_joins) { | ||
|
||
unordered_set<idx_t> table_bindings; | ||
// other operators have not yet removed unused columns, so they will project mark joins | ||
// even though they are not needed at this time | ||
for (auto &binding : plan.GetColumnBindings()) { | ||
table_bindings.insert(binding.table_index); | ||
} | ||
projected_mark_indexes = GetMarkJoinIndexes(plan, table_bindings); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are now calling I think we should invert this in a stand-alone optimizer for the mark join, i.e.: first we find a mark join, then after finding a mark join, we traverse the relevant part of the query tree (e.g. Then if we only find it in the filter we turn the mark join into a semi join. If no mark joins are present we don't need to do any additional work in that case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that this should probably just be a different operator, but I don't think Regardless, I'll work on moving this functionality to a new optimizer |
||
} | ||
|
||
FilterPushdown::FilterPushdown(Optimizer &optimizer, const unordered_set<idx_t> &projected_mark_indexes, | ||
bool convert_mark_joins) | ||
: optimizer(optimizer), combiner(optimizer.context), projected_mark_indexes(projected_mark_indexes), | ||
convert_mark_joins(convert_mark_joins) { | ||
} | ||
|
||
unique_ptr<LogicalOperator> FilterPushdown::Rewrite(unique_ptr<LogicalOperator> op) { | ||
|
@@ -148,7 +220,7 @@ unique_ptr<LogicalOperator> FilterPushdown::PushFinalFilters(unique_ptr<LogicalO | |
unique_ptr<LogicalOperator> FilterPushdown::FinishPushdown(unique_ptr<LogicalOperator> op) { | ||
// unhandled type, first perform filter pushdown in its children | ||
for (auto &child : op->children) { | ||
FilterPushdown pushdown(optimizer, convert_mark_joins); | ||
FilterPushdown pushdown(optimizer, *child, convert_mark_joins); | ||
child = pushdown.Rewrite(std::move(child)); | ||
} | ||
// now push any existing filters | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,14 +9,16 @@ using Filter = FilterPushdown::Filter; | |
unique_ptr<LogicalOperator> FilterPushdown::PushdownMarkJoin(unique_ptr<LogicalOperator> op, | ||
unordered_set<idx_t> &left_bindings, | ||
unordered_set<idx_t> &right_bindings) { | ||
auto op_bindings = op->GetColumnBindings(); | ||
auto &join = op->Cast<LogicalJoin>(); | ||
auto &comp_join = op->Cast<LogicalComparisonJoin>(); | ||
D_ASSERT(join.join_type == JoinType::MARK); | ||
D_ASSERT(op->type == LogicalOperatorType::LOGICAL_COMPARISON_JOIN || | ||
op->type == LogicalOperatorType::LOGICAL_DELIM_JOIN || op->type == LogicalOperatorType::LOGICAL_ASOF_JOIN); | ||
|
||
right_bindings.insert(comp_join.mark_index); | ||
FilterPushdown left_pushdown(optimizer, convert_mark_joins), right_pushdown(optimizer, convert_mark_joins); | ||
FilterPushdown left_pushdown(optimizer, projected_mark_indexes, convert_mark_joins), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering if the Perhaps we should move this to a separate optimizer altogether so that we can clean up the filter pushdown optimizer and leave it to only push down filters? I think it was originally added there because it seemed easy but given that this has caused a number of issues it seems that was the wrong assumption. |
||
right_pushdown(optimizer, projected_mark_indexes, convert_mark_joins); | ||
#ifdef DEBUG | ||
bool simplified_mark_join = false; | ||
#endif | ||
|
@@ -35,7 +37,8 @@ unique_ptr<LogicalOperator> FilterPushdown::PushdownMarkJoin(unique_ptr<LogicalO | |
#endif | ||
// this filter references the marker | ||
// we can turn this into a SEMI join if the filter is on only the marker | ||
if (filters[i]->filter->type == ExpressionType::BOUND_COLUMN_REF && convert_mark_joins) { | ||
if (filters[i]->filter->type == ExpressionType::BOUND_COLUMN_REF && convert_mark_joins && | ||
projected_mark_indexes.find(join.mark_index) == projected_mark_indexes.end()) { | ||
// filter just references the marker: turn into semi join | ||
#ifdef DEBUG | ||
simplified_mark_join = true; | ||
|
@@ -61,7 +64,8 @@ unique_ptr<LogicalOperator> FilterPushdown::PushdownMarkJoin(unique_ptr<LogicalO | |
break; | ||
} | ||
} | ||
if (all_null_values_are_equal && convert_mark_joins) { | ||
if (all_null_values_are_equal && convert_mark_joins && | ||
projected_mark_indexes.find(join.mark_index) == projected_mark_indexes.end()) { | ||
#ifdef DEBUG | ||
simplified_mark_join = true; | ||
#endif | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# name: test/optimizer/pushdown/no_mark_to_semi_if_mark_index_is_projected.test | ||
# description: No mark to semi conversion if the mark join index is projected | ||
# group: [pushdown] | ||
|
||
statement ok | ||
CREATE OR REPLACE TABLE BaseData AS ( | ||
SELECT | ||
'10' AS my_key, | ||
'20' AS parent_key, | ||
'30' AS payload, | ||
'40' as foo, | ||
'50' as foo2, | ||
'60' as foo3 | ||
); | ||
|
||
|
||
# Original query | ||
query III | ||
WITH | ||
Example AS ( | ||
SELECT | ||
c.my_key, | ||
(c.parent_key IN (SELECT my_key FROM BaseData)) AS parentExists, | ||
p.my_key IS NOT NULL AS parentExists2, | ||
FROM BaseData AS c | ||
LEFT JOIN BaseData AS p ON c.parent_key = p.my_key | ||
) | ||
SELECT * | ||
FROM Example | ||
WHERE parentExists | ||
---- | ||
|
||
# original query no CTE | ||
query III | ||
SELECT | ||
c.my_key, | ||
(c.parent_key IN (SELECT my_key FROM BaseData)) AS parentExists, | ||
p.my_key IS NOT NULL AS parentExists2, | ||
FROM BaseData AS c | ||
LEFT JOIN BaseData AS p ON c.parent_key = p.my_key | ||
WHERE parentExists; | ||
---- | ||
|
||
# original query but the CTE is a subquery | ||
query III | ||
SELECT * | ||
FROM (SELECT | ||
c.my_key, | ||
(c.parent_key IN (SELECT my_key FROM BaseData)) AS parentExists, | ||
p.my_key IS NOT NULL AS parentExists2, | ||
FROM BaseData AS c | ||
LEFT JOIN BaseData AS p ON c.parent_key = p.my_key | ||
) | ||
WHERE parentExists; | ||
---- | ||
|
||
statement ok | ||
PRAGMA explain_output='optimized_only' | ||
|
||
query II | ||
EXPLAIN | ||
WITH Example AS ( | ||
SELECT | ||
c.my_key, | ||
(c.parent_key IN (SELECT my_key FROM BaseData)) AS parentExists, | ||
p.my_key IS NOT NULL AS parentExists2, | ||
FROM BaseData AS c | ||
LEFT JOIN BaseData AS p ON c.parent_key = p.my_key | ||
) | ||
SELECT * | ||
FROM Example | ||
WHERE parentExists | ||
---- | ||
logical_opt <REGEX>:.*MARK.* | ||
|
||
query II | ||
EXPLAIN | ||
WITH Example AS ( | ||
SELECT | ||
c.my_key, | ||
(c.parent_key IN (SELECT my_key FROM BaseData)) AS parentExists, | ||
p.my_key IS NOT NULL AS parentExists2, | ||
FROM BaseData AS c | ||
LEFT JOIN BaseData AS p ON c.parent_key = p.my_key | ||
) | ||
SELECT * | ||
FROM Example | ||
WHERE parentExists | ||
---- | ||
logical_opt <!REGEX>:.*SEMI.* | ||
|
||
statement ok | ||
create table t0 as select range a from range(300); | ||
statement ok | ||
create table t2 as select range b from range(50000); | ||
|
||
query I | ||
select sum(in_alias::INT) FROM (select a in (select b from t2) as in_alias from t0) where in_alias; | ||
---- | ||
300 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
ExpressionIterator
only iterates one level - I'm not sure if that is sufficient here.The following query still seems to throw an internal exception:
The
LogicalOperatorVisitor
can do recursive visiting of expressions that should likely be used here.