Profiling columns to determine datatype and top 10 COMPRESS values

Database
Enthusiast

Profiling columns to determine datatype and top 10 COMPRESS values

I wrote this query to help profile columns as I occasionally import text files using Fastload into a one-column table, then split that table using CSVLD into VARCHAR(100) columns.

 

Hope it helps, feel free to add enhancements to the REGEXs etc.

SELECT
--Add Leading comma for second and subsequent recordsCASE WHEN RowNbr = 1 THEN 
    CASE WHEN Max_InferredType = 1 THEN 
            CASE WHEN Min_Numeric_Val >= -128 AND Max_Numeric_Val <= 127 THEN 'BYTEINT'
                WHEN Min_Numeric_Val >= -32768 AND Max_Numeric_Val <= 32767 THEN 'SMALLINT'
                WHEN Min_Numeric_Val >= -2147483648 AND Max_Numeric_Val <= 2147483647 THEN 'INTEGER'
                ELSE 'BIGINT' END
            WHEN Max_InferredType = 2 THEN 'DECIMAL(' || TRIM(DigitPlaces) || ',' || TRIM(DecimalPlaces) || ')'
            WHEN Max_InferredType IN (3,4) THEN 'DATE FORMAT ''YYYY-MM-DD'''
            ELSE 'VARCHAR(' || TRIM(Max_Length) || ') CHARACTER SET LATIN NOT CASESPECIFIC' END
    || ' COMPRESS('
    ELSE '' END ||
    CASE WHEN Max_InferredType IN (1,2) OR Char_Val IS NULL THEN '' ELSE '''' END || COALESCE(Char_Val,'') || CASE WHEN Max_InferredType IN (1,2) OR Char_Val IS NULL THEN '' ELSE '''' END ||
    --For last row add end parenthesis otherwise if Value is not null add trailing comma    CASE WHEN RowNbr = MAX(RowNbr) OVER () THEN '),' ELSE CASE WHEN Char_Val IS NULL THEN '' ELSE ',' END END AS Val    ,FieldCount
FROM (SELECT
ROW_NUMBER() OVER (ORDER BY FieldCount DESC) AS RowNbr,Char_Val,MAX(Len) OVER () AS Max_Length, MAX(InferredType) OVER() AS Max_InferredType,MAX(Char_Val) OVER () AS Max_Char_Val, MIN(Char_Val) OVER () AS Min_Char_Val,MAX(Numeric_Val) OVER () AS Max_Numeric_Val, MIN(Numeric_Val) OVER () AS Min_Numeric_Val,MAX(DigitPlaces) OVER () AS DigitPlaces, MAX(DecimalPlaces) OVER () AS DecimalPlaces,MAX(Date_Val) OVER () AS Max_Date_Val, MIN(Date_Val) OVER () AS Min_Date_Val,FieldCount
FROM (--If Numeric, don't enclose value in single-quoteSELECT
Char_Val,CHARACTER_LENGTH(Char_Val) AS Len,CASE WHEN InferredType IN (1,2) THEN CAST(Char_Val AS DECIMAL(18,1)) END AS Numeric_Val,CASE WHEN InferredType = 2 THEN CHARACTER_LENGTH(REGEXP_REPLACE(Char_Val,'[-|+|,|\.]','',1,0,'m')) END AS DigitPlaces,CASE WHEN InferredType = 2 THEN CHARACTER_LENGTH(REGEXP_REPLACE(Char_Val,'^[-+]?([0-9]*,?)*\.','',1,0,'m')) END AS DecimalPlaces,CASE WHEN InferredType = 3 THEN CAST(Char_Val AS DATE FORMAT 'DD/MM/YYYY')
    WHEN InferredType = 4 THEN CAST(Char_Val AS DATE FORMAT 'YYYY-MM-DD') END AS Date_Val
--,REGEXP_INSTR(Char_Val,'^[-+]?([0-9]*,?)*\.?[0-9]*$') AS IntRegex,REGEXP_REPLACE(Char_Val,'[-|+|,\.]','',1,0,'m') AS DigitsRegex,REGEXP_REPLACE(Char_Val,'^[-+]?([0-9]*,?)*\.','',1,0,'m') AS DecimalRegex
--,REGEXP_INSTR(Char_Val,'^\d{1,2}\/\d{1,2}\/\d{4}$') AS DateRegex
--,REGEXP_INSTR(Char_Val,'^\d{4}-\d{2}-\d{2}$') AS DateRegex2,CASE WHEN Char_Val IS NULL THEN 0
    WHEN REGEXP_INSTR(Char_Val,'^[-+]?([0-9]*,?)*.?$') = 1 THEN 1 --Integer    WHEN REGEXP_INSTR(Char_Val,'^[-+]?([0-9]*,?)*\.[0-9]*$') = 1 THEN 2 --Decimal/Numeric/Float    WHEN REGEXP_INSTR(Char_Val,'^\d{1,2}\/\d{1,2}\/\d{4}$') = 1 THEN 3 --Date dd/mm/yyyy    WHEN REGEXP_INSTR(Char_Val,'^\d{4}-\d{2}-\d{2}$') = 1 THEN 4 --Date yyyy-mm-dd    --Hour and Timestamp Regular Expressions might be tricky due to the many different formats that might be used here are some samples    --WHEN REGEXP_INSTR(Field,'^\d{2}:\d{2}(:\d+.?\d?)?$') = 1 THEN 'HH:MM'    --WHEN REGEXP_INSTR(Field,'^\d{2}:\d{2}(:\d+.?\d?)?$') = 1 THEN 'TIMESTAMP'    ELSE 5 END AS InferredType --Char,FieldCount
FROM (--Aggregate first so we don't have to perform as many REGEXP_INSTR calculationsSELECT COUNT(*) AS FieldCount, TRIM(ColumnName) AS Char_Val
FROM TableName
GROUP BY 2) A
--WHERE Len = 10
--WHERE Inferred_Type = 2
--WHERE DigitPlaces = 10) B
QUALIFY RowNbr <= 10) C
ORDER BY RowNbr