Postgres Recursive Query#
Suppose we have a hierarchy of industries:
root1
root1_child1
root1_child1_grandchild1
root1_child1_grandchild2
root1_child2
root2
root2_child1
And we want to drill down into any of them.
For example the children of root1
:
root1_child1
root1_child1_grandchild1
root1_child1_grandchild2
root1_child2
Here is our initial data:
CREATE TABLE industry
(
id SERIAL PRIMARY KEY,
parent_id INTEGER REFERENCES industry (id),
name TEXT
)
;
INSERT INTO industry (id, parent_id, name)
VALUES (1, NULL, 'root1'),
(2, 1, 'root1_child1'),
(3, 2, 'root1_child1_grandchild1'),
(4, 2, 'root1_child1_grandchild2'),
(5, 1, 'root1_child2'),
(6, NULL, 'root2'),
(7, 6, 'root2_child1')
;
And here is the query for root1
(note the criterion WHERE parent_id = 1
):
WITH RECURSIVE x AS (
SELECT -- anchor
id,
parent_id,
name,
'base' AS dbg
FROM industry
WHERE parent_id = 1 -- criterion
UNION ALL
SELECT -- recursion
child.id,
child.parent_id,
child.name,
'child' AS dbg
FROM industry child
JOIN x ON child.parent_id = x.id
)
SELECT *
FROM x
ORDER BY id
;
This results in the following table:
id |
parent_id |
name |
dbg |
---|---|---|---|
2 |
1 |
root1_child1 |
base |
3 |
2 |
root1_child1_grandchild1 |
child |
4 |
2 |
root1_child1_grandchild2 |
child |
5 |
1 |
root1_child2 |
base |
Counting#
Suppose we want to show the number of industries.
On the root level, we have root1
with 5 entries (including itself) [1] and root2
with 2 entries:
name |
count |
---|---|
root1 |
5 |
root2 |
2 |
For this to work, we need to include a root_id
in our recursive query that we
can group by:
WITH RECURSIVE industry_recursive AS (
SELECT -- anchor
id,
parent_id,
name,
id AS root_id,
name AS root_name
FROM industry
WHERE parent_id IS NULL -- criterion
UNION ALL
SELECT -- recursion
child.id,
child.parent_id,
child.name,
industry_recursive.root_id,
industry_recursive.root_name
FROM industry child
JOIN industry_recursive ON child.parent_id = industry_recursive.id
)
SELECT
root_id,
root_name,
count(id)
FROM industry_recursive
GROUP BY root_id, root_name
ORDER BY root_id
;
The same for children of root1
:
WITH RECURSIVE
x AS (
SELECT -- anchor
id,
parent_id,
name,
id AS root_id,
name AS root_name
FROM industry
WHERE parent_id = 1 -- criterion
UNION ALL
SELECT -- recursion
child.id,
child.parent_id,
child.name,
x.root_id,
x.root_name
FROM industry child
JOIN x ON child.parent_id = x.id
)
SELECT root_id, root_name, count(id)
FROM x
GROUP BY root_id, root_name
ORDER BY root_id
;
results in
root_id |
root_name |
count |
---|---|---|
2 |
root1_child1 |
3 |
5 |
root1_child2 |
1 |
One Query and IS NULL
vs. = NULL
#
Due to the three-valued logic (3VL), IS NULL
is not the same as = NULL
.
To use a single parameter :parent_id
that can accept both an INT or NULL, we have to coalesce like this:
WHERE parent_id = :parent_id OR (:parent_id IS NULL AND parent_id IS NULL)
Aggregating Relations#
Suppose that we have a many-to-many relationship between industry and project:
CREATE TABLE project
(
id SERIAL PRIMARY KEY,
name TEXT
)
;
CREATE TABLE project2industry
(
project_id INT REFERENCES project,
industry_id INT REFERENCES industry
)
;
INSERT INTO project (id, name)
VALUES (1, 'p1 in root1'),
(2, 'p2 in root1'),
(3, 'p3 in root1_child1'),
(4, 'p4 in root1_child1'),
(5, 'p5 in root1_child1_grandchild1'),
(6, 'p6 in root1_child1_grandchild2'),
(7, 'p7 in root1_child1_grandchild2'),
(8, 'p8 in root1_child2'),
(9, 'p9 in root2'),
(10, 'p10 in root2_child1')
;
INSERT INTO project2industry (project_id, industry_id)
VALUES (1, 1),
(2, 1),
(3, 2),
(4, 2),
(5, 3),
(6, 4),
(7, 4),
(8, 5),
(9, 6),
(10, 7)
;
Let’s have a look at the flat data:
-- flat
SELECT
p.name AS project_name,
i.name AS industry_name
FROM project2industry p2i
JOIN project p ON p2i.project_id = p.id
JOIN industry i ON p2i.industry_id = i.id
;
To count the projects by industry recursively:
-- recursive project count for top level industries
WITH RECURSIVE industry_recursive AS (
SELECT -- anchor
id,
parent_id,
name,
id AS root_id,
name AS root_name
FROM industry
WHERE parent_id = :parent_id
OR (:parent_id IS NULL AND parent_id IS NULL)
UNION ALL
SELECT -- recursion
child.id,
child.parent_id,
child.name,
industry_recursive.root_id,
industry_recursive.root_name
FROM industry child
JOIN industry_recursive ON child.parent_id = industry_recursive.id
)
SELECT
root_id,
root_name,
count(p2i.project_id) AS project_count,
array_agg(p2i.project_id) AS project_ids -- helpful to check plausibility
FROM industry_recursive
LEFT JOIN project2industry p2i ON industry_recursive.id = p2i.industry_id
GROUP BY root_id, root_name
;
Further Reading#
https://indrajith.me/posts/recursive-queries-in-postgresql-for-hierarchial-data/
paths (initialize an array on anchor and append in the recursion)
Notes#
In Pycharm, you have to type
NULL
for the:parent_id
parameter. Leaving it at the value<null>
does not work.