Willy-Bas Loos
2014-09-12 16:19:31 UTC
Hi,
Today i ran into a situation where a second left join on an indexed field
would prevent the index from being used, even though the index is clearly
more efficient.
Removing either of the 2 joins would cause that the planner will use the
index again.
I tested this on postgres 9.1 and 9.3 on my ubuntu (amd64) laptop.
--Here's the test data:
create table a (id serial primary key, field1 text);
create table b (id integer, title text, lang integer);
create index b_title_lowerto on b using btree (lower(title)
text_pattern_ops);
vacuum analyze;
with x as (
insert into a
select generate_series(1,40000) as id
returning id
)
insert into b
select id, translate((random()*100*id)::text, '1234567890.', 'abcdefghij'),
1
from x;
update a set field1=translate(id::text, '1234567890.', 'abcdefghij');
insert into b
select b2.id, translate((random()*100*b2.id)::text, '1234567890.',
'abcdefghij'), 2
from b b2;
--Here's the query that doesn't use the index on "b":
select a.field1, b1.title , b2.title
from a
left join b b1 on b1.id = a.id and b1.lang=1
left join b b2 on b2.id = a.id and b2.lang=2
where (lower(b1.title) like'abcd%' or lower(b2.title) like 'abcd%')
--plan:
Hash Right Join (cost=4298.60..7214.76 rows=8 width=35)
Hash Cond: (b1.id = a.id)
Filter: ((lower(b1.title) ~~ 'abcd%'::text) OR (lower(b2.title) ~~
'abcd%'::text))
-> Seq Scan on b b1 (cost=0.00..1510.00 rows=40176 width=19)
Filter: (lang = 1)
-> Hash (cost=3798.60..3798.60 rows=40000 width=24)
-> Hash Right Join (cost=1293.00..3798.60 rows=40000 width=24)
Hash Cond: (b2.id = a.id)
-> Seq Scan on b b2 (cost=0.00..1510.00 rows=39824 width=19)
Filter: (lang = 2)
-> Hash (cost=793.00..793.00 rows=40000 width=9)
-> Seq Scan on a (cost=0.00..793.00 rows=40000
width=9)
--Here's the query that does use the index on "b":
select a.field1, b1.title
from a
left join b b1 on b1.id = a.id and b1.lang=1
where lower(b1.title) like 'abcd%'
union
select a.field1, b2.title
from a
left join b b2 on b2.id = a.id and b2.lang=2
where lower(b2.title) like 'abcd%'
--plan:
HashAggregate (cost=98.31..98.39 rows=8 width=20)
-> Append (cost=4.74..98.27 rows=8 width=20)
-> Nested Loop (cost=4.74..49.10 rows=4 width=20)
-> Bitmap Heap Scan on b b1 (cost=4.45..15.82 rows=4
width=19)
Filter: ((lang = 1) AND (lower(title) ~~ 'abcd%'::text))
-> Bitmap Index Scan on b_title_lowerto
(cost=0.00..4.45 rows=3 width=0)
Index Cond: ((lower(title) ~>=~ 'abcd'::text) AND
(lower(title) ~<~ 'abce'::text))
-> Index Scan using a_pkey on a (cost=0.29..8.31 rows=1
width=9)
Index Cond: (id = b1.id)
-> Nested Loop (cost=4.74..49.10 rows=4 width=20)
-> Bitmap Heap Scan on b b2 (cost=4.45..15.82 rows=4
width=19)
Filter: ((lang = 2) AND (lower(title) ~~ 'abcd%'::text))
-> Bitmap Index Scan on b_title_lowerto
(cost=0.00..4.45 rows=3 width=0)
Index Cond: ((lower(title) ~>=~ 'abcd'::text) AND
(lower(title) ~<~ 'abce'::text))
-> Index Scan using a_pkey on a a_1 (cost=0.29..8.31 rows=1
width=9)
Index Cond: (id = b2.id)
As you can see, the second query is far more efficient, even though it
scans both tables twice to combine the results.
Is this some glitch in the query planner?
Cheers,
Today i ran into a situation where a second left join on an indexed field
would prevent the index from being used, even though the index is clearly
more efficient.
Removing either of the 2 joins would cause that the planner will use the
index again.
I tested this on postgres 9.1 and 9.3 on my ubuntu (amd64) laptop.
--Here's the test data:
create table a (id serial primary key, field1 text);
create table b (id integer, title text, lang integer);
create index b_title_lowerto on b using btree (lower(title)
text_pattern_ops);
vacuum analyze;
with x as (
insert into a
select generate_series(1,40000) as id
returning id
)
insert into b
select id, translate((random()*100*id)::text, '1234567890.', 'abcdefghij'),
1
from x;
update a set field1=translate(id::text, '1234567890.', 'abcdefghij');
insert into b
select b2.id, translate((random()*100*b2.id)::text, '1234567890.',
'abcdefghij'), 2
from b b2;
--Here's the query that doesn't use the index on "b":
select a.field1, b1.title , b2.title
from a
left join b b1 on b1.id = a.id and b1.lang=1
left join b b2 on b2.id = a.id and b2.lang=2
where (lower(b1.title) like'abcd%' or lower(b2.title) like 'abcd%')
--plan:
Hash Right Join (cost=4298.60..7214.76 rows=8 width=35)
Hash Cond: (b1.id = a.id)
Filter: ((lower(b1.title) ~~ 'abcd%'::text) OR (lower(b2.title) ~~
'abcd%'::text))
-> Seq Scan on b b1 (cost=0.00..1510.00 rows=40176 width=19)
Filter: (lang = 1)
-> Hash (cost=3798.60..3798.60 rows=40000 width=24)
-> Hash Right Join (cost=1293.00..3798.60 rows=40000 width=24)
Hash Cond: (b2.id = a.id)
-> Seq Scan on b b2 (cost=0.00..1510.00 rows=39824 width=19)
Filter: (lang = 2)
-> Hash (cost=793.00..793.00 rows=40000 width=9)
-> Seq Scan on a (cost=0.00..793.00 rows=40000
width=9)
--Here's the query that does use the index on "b":
select a.field1, b1.title
from a
left join b b1 on b1.id = a.id and b1.lang=1
where lower(b1.title) like 'abcd%'
union
select a.field1, b2.title
from a
left join b b2 on b2.id = a.id and b2.lang=2
where lower(b2.title) like 'abcd%'
--plan:
HashAggregate (cost=98.31..98.39 rows=8 width=20)
-> Append (cost=4.74..98.27 rows=8 width=20)
-> Nested Loop (cost=4.74..49.10 rows=4 width=20)
-> Bitmap Heap Scan on b b1 (cost=4.45..15.82 rows=4
width=19)
Filter: ((lang = 1) AND (lower(title) ~~ 'abcd%'::text))
-> Bitmap Index Scan on b_title_lowerto
(cost=0.00..4.45 rows=3 width=0)
Index Cond: ((lower(title) ~>=~ 'abcd'::text) AND
(lower(title) ~<~ 'abce'::text))
-> Index Scan using a_pkey on a (cost=0.29..8.31 rows=1
width=9)
Index Cond: (id = b1.id)
-> Nested Loop (cost=4.74..49.10 rows=4 width=20)
-> Bitmap Heap Scan on b b2 (cost=4.45..15.82 rows=4
width=19)
Filter: ((lang = 2) AND (lower(title) ~~ 'abcd%'::text))
-> Bitmap Index Scan on b_title_lowerto
(cost=0.00..4.45 rows=3 width=0)
Index Cond: ((lower(title) ~>=~ 'abcd'::text) AND
(lower(title) ~<~ 'abce'::text))
-> Index Scan using a_pkey on a a_1 (cost=0.29..8.31 rows=1
width=9)
Index Cond: (id = b2.id)
As you can see, the second query is far more efficient, even though it
scans both tables twice to combine the results.
Is this some glitch in the query planner?
Cheers,
--
Willy-Bas Loos
Willy-Bas Loos