ASTER EXPRESS: WEBINAR: Multi-Genre Analytics: Scripts

Learn Data Science
Teradata Employee

Please review the full script for the Webinar on April 19, 2016.

--PLEASE FOLLOW THE LINK BELOW TO INSTALL THE ANALYTICS NECESSARY:

ASTER EXPRESS: WEBINAR: Multi-Genre Analytics: Analytics to Install

--BELOW IS THE FULL SCRIPT FROM TODAY'S WEBINAR.

SELECT * FROM public.bank_web_clicks limit 100;

SELECT COUNT(*) FROM public.bank_web_clicks;

--------------------------------------------------

--SESSIONIZE THE DATA

--------------------------------------------------

SELECT * FROM public.bank_web_clicks_nonses order by customer_id, datestamp LIMIT 100;

DROP TABLE IF EXISTS bank_web_clicks_ses;

CREATE TABLE public.bank_web_clicks_ses DISTRIBUTE BY HASH(customer_id) AS

SELECT * FROM SESSIONIZE(

ON bank_web_clicks_nonses

PARTITION BY customer_id

ORDER BY datestamp

TIMECOLUMN('datestamp')

TIMEOUT('350')

) ORDER BY customer_id, datestamp;

SELECT * FROM bank_web_clicks_ses ORDER BY customer_id, sessionid LIMIT 100;

--------------------------------------------------

--SESSIONIZE THE DATA

--------------------------------------------------

CREATE TABLE bank_web_paths_top40 DISTRIBUTE BY REPLICATION AS

SELECT DISTINCT path, count(*) cnt

    FROM nPath (

    ON bank_web_clicks

    PARTITION BY customer_id, session_id

    ORDER BY datestamp

    MODE(NONOVERLAPPING)

    PATTERN ('PAGE+')

    SYMBOLS

    (

    TRUE AS PAGE

    )

    RESULT (

    ACCUMULATE (page OF ANY (PAGE)) AS path

    )

    )n

    GROUP BY 1

    ORDER BY 2 DESC

    LIMIT 40;

---------------------------------------------------

--BUILD MY TRAINING DATA SET

---------------------------------------------------

DROP TABLE IF EXISTS public.bank_web_training;

CREATE TABLE public.bank_web_training DISTRIBUTE BY REPLICATION AS

SELECT DISTINCT customer_id, path, 'yes_enroll' as enrollstat

    FROM nPath (

    ON bank_web_clicks

    PARTITION BY customer_id, session_id

    ORDER BY datestamp

   MODE(NONOVERLAPPING)

    PATTERN ('PAGE{4}+.BME')

    SYMBOLS

    (

    TRUE AS PAGE,

    page = 'BILL MANAGER ENROLLMENT' AS BME

    )

    RESULT (

    FIRST(customer_id OF ANY(PAGE, BME)) as customer_id,

    ACCUMULATE (page OF ANY (PAGE, BME)) AS path

    )

    )n;

INSERT INTO public.bank_web_training

SELECT DISTINCT customer_id, path, 'no_enroll' as enrollstat

    FROM nPath (

    ON bank_web_clicks

    PARTITION BY customer_id, session_id

    ORDER BY datestamp

    MODE(NONOVERLAPPING)

    PATTERN ('PAGE{4}+.BMF.NBME')

    SYMBOLS

    (

    TRUE AS PAGE,

    page = 'BILL MANAGER FORM' AS BMF,

    page <> 'BILL MANAGER ENROLLMENT' AS NBME

    )

    RESULT (

    FIRST(customer_id OF ANY(PAGE, BMF, NBME)) as customer_id,

    ACCUMULATE (page OF ANY (PAGE, BMF, NBME)) AS path

    )

    )n;

SELECT * FROM public.bank_web_training;

----------------------------------------------------

--CREATE TEST DATA

----------------------------------------------------

DROP TABLE IF EXISTS public.bank_web_test;

CREATE TABLE public.bank_web_test DISTRIBUTE BY REPLICATION AS

SELECT DISTINCT customer_id, path

    FROM nPath (

    ON bank_web_clicks

    PARTITION BY customer_id, session_id

    ORDER BY datestamp

    MODE(NONOVERLAPPING)

    PATTERN ('PAGE{4}+')

    SYMBOLS

    (

    TRUE AS PAGE

    )

    RESULT (

    FIRST(customer_id OF ANY(PAGE)) as customer_id,

    ACCUMULATE (page OF ANY (PAGE)) AS path

    )

    )n;

----------------------------------------------------

--BUILD MY PREDICTIVE MODEL USING NAIVE BAYES

----------------------------------------------------

DROP TABLE IF EXISTS public.bank_web_enroll_model;

CREATE TABLE public.bank_web_enroll_model DISTRIBUTE BY HASH(token) AS

SELECT token, SUM(yes_enroll) AS yes_enroll, SUM(no_enroll) AS no_enroll

FROM naiveBayesText(

ON bank_web_training

TEXT_COLUMN( 'path' )

CATEGORY_COLUMN( 'enrollstat' )

CATEGORIES( 'yes_enroll', 'no_enroll' )

DELIMITER('[,]')

PUNCTUATION('[\\[\\] ]+')) GROUP BY token;

SELECT * FROM public.bank_web_enroll_model;

----------------------------------------------------

--IMPLEMENT THE MODEL USING TEST DATA

----------------------------------------------------

DROP TABLE IF EXISTS public.bank_web_predict;

CREATE TABLE public.bank_web_predict DISTRIBUTE BY HASH(customer_id) AS

SELECT * FROM naiveBayesTextPredict_mi(

    ON bank_web_test AS "DATA"  PARTITION BY ANY

    ON bank_web_enroll_model AS "MODEL" DIMENSION

    categories('yes_enroll', 'no_enroll' )

    text_column('path')

    delimiter(',')

    punctuation('[\\[\\] ]+')

);

SELECT FROM bank_web_predict LIMIT 1000;