Statistical Combination in SQL

Database
The Teradata Database channel includes discussions around advanced Teradata features such as high-performance parallel database technology, the optimizer, mixed workload management solutions, and other related technologies.
Enthusiast

Statistical Combination in SQL

May I arouse your curiosity with a little SQL brain teaser?
I need to solve this (something simular) in my project…

Next to my gratitude the solver will gain eternal fame!

Thanks, Carl

 

The question:
Every year the local commune organises a summer camp for children.
Each year, EACH family can send ONE child to summer camp. (=> the number of children in the camp will be equal to the number of families)
I search a (single?) query that will return the POSSIBLE groups for the summer camp. (Keep in mind... the number of families can change, also the children of each family can change.)

 

--Source table
CREATE TABLE DB.FamilyAndChildren AS (
SELECT 'Family1' AS Family, 'Carl' (VARCHAR(32)) AS child, 1 AS ChildId FROM (SELECT 1 AS One) T
UNION SELECT 'Family1','Tyagi' ,2 FROM (SELECT 1 AS One) T
UNION SELECT 'Family2','Jan' ,1 FROM (SELECT 1 AS One) T
UNION SELECT 'Family2','Claudio' ,2 FROM (SELECT 1 AS One) T
UNION SELECT 'Family2','Mieke' ,3 FROM (SELECT 1 AS One) T
UNION SELECT 'Family3','Youri' ,1 FROM (SELECT 1 AS One) T
UNION SELECT 'Family4','Dirk' ,1 FROM (SELECT 1 AS One) T
UNION SELECT 'Family4','Luisa' ,2 FROM (SELECT 1 AS One) T
UNION SELECT 'Family5','Fred' ,1 FROM (SELECT 1 AS One) T
UNION SELECT 'Family5','Lillo' ,2 FROM (SELECT 1 AS One) T
UNION SELECT 'Family5','Oliver' ,3 FROM (SELECT 1 AS One) T
UNION SELECT 'Family6','Erhan' ,1 FROM (SELECT 1 AS One) T
UNION SELECT 'Family6','Olivier' ,2 FROM (SELECT 1 AS One) T
) WITH DATA;
select * from DB.FamilyAndChildren order by Family, ChildId;
select Family from DB.FamilyAndChildren GROUP BY Family;

 

SOURCE:
Family child ChildId
'Family1' 'Carl' 1
'Family1' 'Tyagi' 2
'Family2' 'Jan' 1
'Family2' 'Claudio' 2
'Family2' 'Mieke' 3
'Family3' 'Youri' 1
'Family4' 'Dirk' 1
'Family4' 'Luisa' 2
'Family5' 'Fred' 1
'Family5' 'Lillo' 2
'Family5' 'Oliver' 3
'Family6' 'Erhan' 1
'Family6' 'Olivier' 2

 

--And now I search a SQL that will generate this result:
RESULT:
PossibleGroup Family Child
1, 'Family1', 'Carl'
1, 'Family2', 'Jan'
1, 'Family3', 'Youri'
1, 'Family4', 'Dirk'
1, 'Family5', 'Fred'
1, 'Family6', 'Erhan'

2, 'Family1', 'Carl'
2, 'Family2', 'Claudio'
2, 'Family3', 'Youri'
2, 'Family4', 'Dirk'
2, 'Family5', 'Fred'
2, 'Family6', 'Erhan'

3, 'Family1', 'Carl'
3, 'Family2', 'Mieke'
3, 'Family3', 'Youri'
3, 'Family4', 'Dirk'
3, 'Family5', 'Fred'
3, 'Family6', 'Erhan'
...etc

 

Tags (1)

Accepted Solutions
Teradata Employee

Re: Statistical Combination in SQL

Side note, if you're in 15.10, reverse the order of the CTE-s (they are read from bottom to top).

1 ACCEPTED SOLUTION
10 REPLIES 10
Teradata Employee

Re: Statistical Combination in SQL

 

Hi crochlit,

 

Is a stored procedure acceptable ?

The regular SQL query depends on the number of families here and it can be kinda easily built using a stored procedure :

replace procedure DB.sp_generate_combi()
dynamic result sets 1
sql security invoker
mp:
begin
    declare v$_sql              varchar(8000);
    declare v$_rn_chr           varchar(   3);
    declare v$_sql_select       varchar(1000);
    declare v$_sql_from         varchar(2000);
    declare v$_sql_where        varchar( 500);
    declare v$_sql_upv_cst      varchar( 100);
    declare v$_sql_upv_lst      varchar(2000);
    
    l$_fc:
    for cur_col as colcur cursor for
    (
          select Family
               , row_number() over(order by Family) as rn
            from DB.FamilyAndChildren
        group by Family
        order by rn asc
    )
    do
        set mp.v$_rn_chr = to_char(cur_col.rn, 'fm999');
        
        case cur_col.rn
          when 1
          then set mp.v$_sql_select  = 'with cte_data as' || chr(10) || '(' || chr(10) || 'select row_number() over(order by null asc) as rn' || chr(10) || '     , fm1.Child as c1';
               set mp.v$_sql_from    = chr(10) || '  from DB.FamilyAndChildren as fm1';
               set mp.v$_sql_where   = chr(10) || ' where fm1.Family = ''' || cur_col.Family || '''' || chr(10) || ')' || chr(10);
               set mp.v$_sql_upv_lst = '( c1 as ''' || cur_col.Family || '''' || chr(10);
          else set mp.v$_sql_select  = mp.v$_sql_select  || chr(10) || '     , fm' || mp.v$_rn_chr || '.Child as c' || mp.v$_rn_chr;
               set mp.v$_sql_from    = mp.v$_sql_from    || chr(10) || '  join DB.FamilyAndChildren as fm' || mp.v$_rn_chr || ' on fm' || mp.v$_rn_chr || '.Family = ''' || cur_col.Family || '''';
               set mp.v$_sql_upv_lst = mp.v$_sql_upv_lst || '                              , c' || mp.v$_rn_chr || ' as ''' || cur_col.Family || '''' || chr(10);
        end case;
        
    end for l$_fc;
    
    set mp.v$_sql_upv_cst = '  select *' || chr(10) || '    from cte_data' || chr(10) || ' unpivot (Child for Family in ';
    
    set mp.v$_sql = mp.v$_sql_select || mp.v$_sql_from || mp.v$_sql_where
                 || mp.v$_sql_upv_cst || mp.v$_sql_upv_lst
                 || '                              )) as upv;';
    
    ds:
    begin
        declare exec_query scroll cursor with return for sql_statement;
        prepare sql_statement from mp.v$_sql;
        open exec_query;
    end ds;

end mp;

call DB.sp_generate_combi();

 

The query that is built and executed by the procedure :

with cte_data as
(
select row_number() over(order by null asc) as rn
     , fm1.Child as c1
     , fm2.Child as c2
     , fm3.Child as c3
     , fm4.Child as c4
     , fm5.Child as c5
     , fm6.Child as c6
  from DB.FamilyAndChildren as fm1
  join DB.FamilyAndChildren as fm2 on fm2.Family = 'Family2'
  join DB.FamilyAndChildren as fm3 on fm3.Family = 'Family3'
  join DB.FamilyAndChildren as fm4 on fm4.Family = 'Family4'
  join DB.FamilyAndChildren as fm5 on fm5.Family = 'Family5'
  join DB.FamilyAndChildren as fm6 on fm6.Family = 'Family6'
 where fm1.Family = 'Family1'
)
  select *
    from cte_data
 unpivot (Child for Family in ( c1 as 'Family1'
                              , c2 as 'Family2'
                              , c3 as 'Family3'
                              , c4 as 'Family4'
                              , c5 as 'Family5'
                              , c6 as 'Family6'
                              )) as upv;

 

 

Enthusiast

Re: Statistical Combination in SQL

Thanks Waldar! Interesting solution, but I am still searching/hoping for a "pure SQL" solution.
Which will -to my feeling- involve recusivity and a (some kind of) cross join.

KR, Carl

Visitor

Re: Statistical Combination in SQL

 
Teradata Employee

Re: Statistical Combination in SQL

Indeed, (fake) cross joins and recursivity does the trick :

with cte_family (Family, Child, NoFamily, NoFamilyDesc) as
(
select Family
     , Child
     , dense_rank() over(order by Family  asc)
     , dense_rank() over(order by Family desc)
  from DB.FamilyAndChildren
)
  , recursive cte_recurs (All_lst, NoFamily, NoFamilyDesc) as
(
select Family || '$' || Child (varchar(1000)), NoFamily, NoFamilyDesc
  from cte_family
 where NoFamily = 1
 union all
select rec.All_lst || ';' || cte.Family || '$' || cte.Child
     , cte.NoFamily, cte.NoFamilyDesc
  from cte_recurs as rec
  join cte_family as cte on cte.NoFamily = rec.NoFamily + 1
)
  ,  cte_Child_lst (rn, All_lst) as
(
select row_number() over(order by All_lst asc) as rn
     , All_lst
  from cte_recurs
 where NoFamilyDesc = 1
)
select d.outkey                as PossibleGroup
     , strtok(d.token, '$', 1) as Family
     , strtok(d.token, '$', 2) as Child
  from table( strtok_split_to_table(cte_Child_lst.rn, cte_Child_lst.All_lst, ';')
              returns (outkey integer, tokennum integer, token varchar(65) character set unicode) ) as d
order by 1, 2;
Teradata Employee

Re: Statistical Combination in SQL

Side note, if you're in 15.10, reverse the order of the CTE-s (they are read from bottom to top).

Enthusiast

Re: Statistical Combination in SQL

Hi Waldar,

Although this solution fulfills the challenge posed, it is not applicable in a operational environment. It immediately hits spoolspace, I/O, CPU,... limits.
Taking a closer look at the solution, it builds-up and de-composes a string.

Is there a solution that consists only of pure SQL features?

 

@dnoeth would you be willing to have a look at this?
(I just found out you have a recommendation from Joe Celko on Twitter... Very Impressed!)


Kind regards, Carl

Teradata Employee

Re: Statistical Combination in SQL

Hi Carl,

 

Obvisouly this solution depends on the number of families / childs you have, it's a" number of childs" power "number of families" problem and you want all the combinations.

 

Did you calculate how much data you're expecting ?

This is the query to do so :

with cte_Family_stats (FamilyId, NbChilds) as
(
  select Family, count(*)
    from DB.FamilyAndChildren
group by Family
)
select count(*)                   as NbFamilies
     , exp(sum(ln(NbChilds)))     as NbCombinations
     , NbCombination * NbFamilies as NbRows
  from cte_Family_stats;

 

The good thing with the query I provided is you can split the recursive portion in n-parts :

create multiset volatile table mvt_result1, no log
as
(
with recursive cte_recurs (All_lst, NoFamily, NoFamilyDesc) as
(
select Family || '$' || Child (varchar(1000)), NoFamily, NoFamilyDesc
  from cte_family
 where NoFamily = 1
 union all
select rec.All_lst || ';' || cte.Family || '$' || cte.Child
     , cte.NoFamily, cte.NoFamilyDesc
  from cte_recurs as rec
  join cte_family as cte on cte.NoFamily = rec.NoFamily + 1
)
  ,  cte_family (Family, Child, NoFamily, NoFamilyDesc) as
(
select Family
     , Child
     , dense_rank() over(order by Family  asc)
     , dense_rank() over(order by Family desc)
  from DB.FamilyAndChildren
 where Family <= 'Family3'
)
select All_lst
  from cte_recurs
 where NoFamilyDesc = 1
)
with data
no primary index
on commit preserve rows;


create multiset volatile table mvt_result2, no log
as
(
with recursive cte_recurs (All_lst, NoFamily, NoFamilyDesc) as
(
select Family || '$' || Child (varchar(1000)), NoFamily, NoFamilyDesc
  from cte_family
 where NoFamily = 1
 union all
select rec.All_lst || ';' || cte.Family || '$' || cte.Child
     , cte.NoFamily, cte.NoFamilyDesc
  from cte_recurs as rec
  join cte_family as cte on cte.NoFamily = rec.NoFamily + 1
)
  ,  cte_family (Family, Child, NoFamily, NoFamilyDesc) as
(
select Family
     , Child
     , dense_rank() over(order by Family  asc)
     , dense_rank() over(order by Family desc)
  from DB.FamilyAndChildren
 where Family > 'Family3'
)
select All_lst
  from cte_recurs
 where NoFamilyDesc = 1
)
with data
no primary index
on commit preserve rows;


with cte_AllFamilies (rn, All_lst) as
(
select row_number() over(order by r1.All_lst asc, r2.All_lst asc)
     , r1.All_lst || ';' || r2.All_lst
  from mvt_result1 as r1
     , mvt_result2 as r2
)
select d.outkey                as PossibleGroup
     , strtok(d.token, '$', 1) as Family
     , strtok(d.token, '$', 2) as Child
  from table( strtok_split_to_table(cte_AllFamilies.rn, cte_AllFamilies.All_lst, ';')
              returns (outkey integer, tokennum integer, token varchar(65) character set unicode) ) as d
order by 1, 2;

But this is no longer "one" query.

 

I still think the "best" running query is the one built by the stored procedure in my first answer.

 

 

Ambassador

Re: Statistical Combination in SQL

This calculates the final number of combinations and uses EXPAND ON to create the correct number of rows per child and to assign sequential numbers.

Then those numbers are shuffled to create all possible groups of unique combinations:

 

SELECT dt.*
-- maybe this calculation can be simplified? ,(rownum / grp_size * child_cnt + (child_num-1)) * grp_size + (rownum MOD grp_size) +1 AS PossibleGroup FROM ( SELECT dt.* ,(End(pd) - DATE '0001-01-01') - 1 AS rownum FROM ( SELECT dt.* -- all combinations = number of possible groups ,Cast(Round(Exp(Sum(Ln(CASE WHEN child_num = 1 THEN child_cnt end)) Over ())) AS INT) AS #Combinations -- rows per group ,Cast(Round(Exp(Sum(Ln(CASE WHEN child_num = 1 THEN child_cnt end)) Over (ORDER BY child_cnt, family, child_num DESC ROWS BETWEEN CURRENT ROW AND Unbounded Following))) AS INT) / child_cnt AS grp_size FROM ( SELECT Family ,child -- number of children per family ,Count(*) Over (PARTITION BY Family) AS child_cnt -- 1st, 2nd, 3rd child -- childid might already match this calculation ,Row_Number() Over (PARTITION BY Family ORDER BY childid) AS child_num FROM FamilyAndChildren
-- families with a single child will not increase the number of combinations
QUALIFY child_cnt > 1 ) AS dt ) AS dt -- create all rows for a child EXPAND ON PERIOD(DATE '0001-01-01', DATE '0001-01-01' + (#Combinations / child_cnt)) AS pd ) AS dt

 

 

But @Waldar is right, of course, the number of combinations/rows grows very fast, the cast to INT is a built-in limit to 2.1 billion rows :-)

At least the families with a single child can be ignored...

Ambassador

Re: Statistical Combination in SQL

Ooops, the EXPAND based on dates limits the number of rows to 3.6 million, must switch to timestamps to increase it to 2.1 billion:

SELECT dt.*
  ,Cast(Cast((End(pd) - TIMESTAMP '0001-01-01 00:00:00' Second(4)) AS DECIMAL(10,6)) * 1000000 AS INT) - 1  AS rownum
  -- maybe this calculation can be simplified?
  ,(rownum / grp_size * child_cnt + (child_num-1)) * grp_size + (rownum MOD grp_size) +1 AS PossibleGroup  
FROM
 (
   SELECT dt.*, pd
   FROM
    (
      SELECT dt.*
        -- all combinations = number of possible groups
        ,Cast(Round(Exp(Sum(Ln(CASE WHEN child_num = 1 THEN child_cnt end))
                        Over ())) AS INT) AS #Combinations
        -- rows per group
        ,Cast(Round(Exp(Sum(Ln(CASE WHEN child_num = 1 THEN child_cnt end))
                        Over (ORDER BY child_cnt, family, child_num DESC
                              ROWS BETWEEN CURRENT ROW AND Unbounded Following))) AS INT) / child_cnt AS grp_size
      FROM 
       (
         SELECT
            Family
           ,child
           -- number of children per family
           ,Count(*) Over (PARTITION BY Family) AS child_cnt
           -- 1st, 2nd, 3rd, ... child
           ,Row_Number() Over (PARTITION BY Family ORDER BY childid) AS child_num
         FROM FamilyAndChildren
         -- families with a single child will not increase the number of combinations
         QUALIFY child_cnt > 1
       ) AS dt
    ) AS dt
   -- create all rows for a child
   EXPAND ON PERIOD(TIMESTAMP '0001-01-01 00:00:00', TIMESTAMP '0001-01-01 00:00:00.000000' + ((#Combinations / child_cnt) * INTERVAL '0.000001' SECOND)) AS pd
 ) AS dt