Postgres Recursive Query#

Suppose we have a hierarchy of industries:

  • root1

    • root1_child1

      • root1_child1_grandchild1

      • root1_child1_grandchild2

    • root1_child2

  • root2

    • root2_child1

And we want to drill down into any of them. For example the children of root1:

  • root1_child1

    • root1_child1_grandchild1

    • root1_child1_grandchild2

  • root1_child2

Here is our initial data:

CREATE TABLE industry
(
    id        SERIAL PRIMARY KEY,
    parent_id INTEGER REFERENCES industry (id),
    name      TEXT
)
;
INSERT INTO industry (id, parent_id, name)
VALUES (1, NULL, 'root1'),
    (2, 1, 'root1_child1'),
    (3, 2, 'root1_child1_grandchild1'),
    (4, 2, 'root1_child1_grandchild2'),
    (5, 1, 'root1_child2'),
    (6, NULL, 'root2'),
    (7, 6, 'root2_child1')
;

And here is the query for root1 (note the criterion WHERE parent_id = 1):

WITH RECURSIVE x AS (
    SELECT  -- anchor
        id,
        parent_id,
        name,
        'base' AS dbg
    FROM industry
    WHERE parent_id = 1  -- criterion
    UNION ALL
    SELECT  -- recursion
        child.id,
        child.parent_id,
        child.name,
        'child' AS dbg
    FROM industry child
             JOIN x ON child.parent_id = x.id
)
SELECT *
FROM x
ORDER BY id
;

This results in the following table:

id

parent_id

name

dbg

2

1

root1_child1

base

3

2

root1_child1_grandchild1

child

4

2

root1_child1_grandchild2

child

5

1

root1_child2

base

Counting#

Suppose we want to show the number of industries.

On the root level, we have root1 with 5 entries (including itself) [1] and root2 with 2 entries:

name

count

root1

5

root2

2

For this to work, we need to include a root_id in our recursive query that we can group by:

WITH RECURSIVE industry_recursive AS (
    SELECT -- anchor
        id,
        parent_id,
        name,
        id AS root_id,
        name AS root_name
    FROM industry
    WHERE parent_id IS NULL -- criterion
    UNION ALL
    SELECT -- recursion
        child.id,
        child.parent_id,
        child.name,
        industry_recursive.root_id,
        industry_recursive.root_name
    FROM industry child
             JOIN industry_recursive ON child.parent_id = industry_recursive.id
)
SELECT
    root_id,
    root_name,
    count(id)
FROM industry_recursive
GROUP BY root_id, root_name
ORDER BY root_id
;

The same for children of root1:

WITH RECURSIVE
    x AS (
        SELECT -- anchor
            id,
            parent_id,
            name,
            id AS root_id,
            name AS root_name
        FROM industry
        WHERE parent_id = 1  -- criterion
        UNION ALL
        SELECT -- recursion
            child.id,
            child.parent_id,
            child.name,
            x.root_id,
            x.root_name
        FROM industry child
                 JOIN x ON child.parent_id = x.id
    )
SELECT root_id, root_name, count(id)
FROM x
GROUP BY root_id, root_name
ORDER BY root_id
;

results in

root_id

root_name

count

2

root1_child1

3

5

root1_child2

1

One Query and IS NULL vs. = NULL#

Due to the three-valued logic (3VL), IS NULL is not the same as = NULL.

To use a single parameter :parent_id that can accept both an INT or NULL, we have to coalesce like this:

WHERE parent_id = :parent_id OR (:parent_id IS NULL AND parent_id IS NULL)

Aggregating Relations#

Suppose that we have a many-to-many relationship between industry and project:

CREATE TABLE project
(
    id   SERIAL PRIMARY KEY,
    name TEXT
)
;
CREATE TABLE project2industry
(
    project_id  INT REFERENCES project,
    industry_id INT REFERENCES industry
)
;

INSERT INTO project (id, name)
VALUES (1, 'p1 in root1'),
    (2, 'p2 in root1'),
    (3, 'p3 in root1_child1'),
    (4, 'p4 in root1_child1'),
    (5, 'p5 in root1_child1_grandchild1'),
    (6, 'p6 in root1_child1_grandchild2'),
    (7, 'p7 in root1_child1_grandchild2'),
    (8, 'p8 in root1_child2'),
    (9, 'p9 in root2'),
    (10, 'p10 in root2_child1')
;

INSERT INTO project2industry (project_id, industry_id)
VALUES (1, 1),
    (2, 1),
    (3, 2),
    (4, 2),
    (5, 3),
    (6, 4),
    (7, 4),
    (8, 5),
    (9, 6),
    (10, 7)
;

Let’s have a look at the flat data:

-- flat
SELECT
    p.name AS project_name,
    i.name AS industry_name
FROM project2industry p2i
         JOIN project p ON p2i.project_id = p.id
         JOIN industry i ON p2i.industry_id = i.id
;

To count the projects by industry recursively:

-- recursive project count for top level industries
WITH RECURSIVE industry_recursive AS (
    SELECT -- anchor
        id,
        parent_id,
        name,
        id AS root_id,
        name AS root_name
    FROM industry
    WHERE parent_id = :parent_id
        OR (:parent_id IS NULL AND parent_id IS NULL)
    UNION ALL
    SELECT -- recursion
        child.id,
        child.parent_id,
        child.name,
        industry_recursive.root_id,
        industry_recursive.root_name
    FROM industry child
             JOIN industry_recursive ON child.parent_id = industry_recursive.id
)
SELECT
    root_id,
    root_name,
    count(p2i.project_id) AS project_count,
    array_agg(p2i.project_id) AS project_ids -- helpful to check plausibility
FROM industry_recursive
         LEFT JOIN project2industry p2i ON industry_recursive.id = p2i.industry_id
GROUP BY root_id, root_name
;

Further Reading#

Notes#

  • In Pycharm, you have to type NULL for the :parent_id parameter. Leaving it at the value <null> does not work.