Super Sweet nPath Examples... With Source Code

Learn Data Science
Teradata Employee

Determine the average, minimum, maximum number of days between visits and the count of the total number of visits by gender and frequent buyer level for the months of June and July 2008. Marketing would like this information sorted by gender and frequent buyer level.


SELECT * FROM nPath( ON prod.sales_fact

PARTITION BY customer_id

ORDER BY sales_date

SYMBOLS (true AS A)

PATTERN ('^A')

RESULT (FIRST(customer_id of A) as customer_id

   , FIRST(sales_date of A) as first_sales_date

   , FIRST(product_id of A) as first_product_purchased)

MODE(NONOVERLAPPING));


Change the pattern so that instead we get the second product purchased (but still the date of the first sale).


SELECT * FROM nPath( ON prod.sales_fact

PARTITION BY customer_id

ORDER BY sales_date

SYMBOLS (true AS A)

PATTERN ('^A.A')

RESULT (FIRST(customer_id of A) as customer_id

, FIRST(sales_date of A) as first_sales_date

, LAST(product_id of A) as second_product_purchased)

MODE(NONOVERLAPPING));

Requirement 3:

Determine the average, minimum, maximum number of days between visits and the count of the total number of visits by gender and frequent buyer level for the months of June and July 2008. Marketing would like this information sorted by gender and frequent buyer level.


SELECT * FROM nPath( ON prod.sales_fact

PARTITION BY customer_id

ORDER BY sales_date

SYMBOLS (true AS A, true as B)

PATTERN ('^A.B')

RESULT (FIRST(customer_id of A) as customer_id

   , FIRST(sales_date of A) as first_sales_date

   , FIRST(product_id of B) as second_product_purchased)

MODE(NONOVERLAPPING));


We would like to know if the customer discounts are having any effect on customer visits. We'll look to see if having a large discount (greater than .10 cents) leads to a greater number of additional purchases made at the store. Specifically, we want to know the date of the first large discount event ( > .10), the size of the discount, and the total number of unique products purchased after that discount. First, construct an nPath query that returns the total number of products purchased after receiving a discount, and the total number of product purchased before receiving a discount.


SELECT * FROM nPath( ON prod.sales_fact

PARTITION BY customer_id

ORDER BY sales_date

PATTERN('^(NODISCOUNT*).(DISCOUNT).(A*)')

SYMBOLS(discount_amount < .10 as NODISCOUNT, discount_amount >= .10 as DISCOUNT, TRUE as A)

RESULT (FIRST(customer_id of DISCOUNT) as customer_id

   , FIRST(sales_date of DISCOUNT) as discount_date

   , FIRST(discount_amount of DISCOUNT) as first_large_discount

   , COUNT(* of A) as post_discount_visits

   , COUNT(* of NODISCOUNT) AS pre_discount_visits)

MODE(NONOVERLAPPING));

What would happen if you replaced FIRST(customer_id of DISCOUNT) with FIRST(customer_id of NODISCOUNT) ?

SELECT * FROM nPath( ON prod.sales_fact

PARTITION BY customer_id

ORDER BY sales_date

PATTERN('^(NODISCOUNT*).(DISCOUNT).(A*)')

SYMBOLS(discount_amount < .10 as NODISCOUNT, discount_amount >= .10 as DISCOUNT, TRUE as A)

RESULT (FIRST(customer_id of NODISCOUNT) as customer_id

, FIRST(sales_date of DISCOUNT) as discount_date

, FIRST(discount_amount of DISCOUNT) as first_large_discount

, COUNT(* of A) as post_discount_visits

, COUNT(* of NODISCOUNT) AS pre_discount_visits)

  MODE(NONOVERLAPPING));

Determine the average, minimum, maximum number of days between visits and the count of the total number of visits by gender and frequent buyer level for the months of June and July 2008. Marketing would like this information sorted by gender and frequent buyer level.

select

  1. c.gender

, c.frequent_buyer_level

, sum(t.visit_count) total_visit_count

, avg(t.b_sales_date - t.a_sales_date) avg_days_between_visits

, min(t.b_sales_date - t.a_sales_date) min_days_between_visits

, max(t.b_sales_date - t.a_sales_date) max_days_between_visits from nPath (

ON (select s.* FROM prod.sales_fact as s INNER JOIN prod.product_dim as p ON (s.product_id = p.product_id)

where s.sales_date::date between to_date('2008-06-01', 'YYYY-MM-DD') and to_date('2008-08-01', 'YYYY-MM-DD'))

PARTITION BY customer_id

ORDER BY sales_date

MODE(OVERLAPPING)

PATTERN('A.B')

SYMBOLS(

true as A, true as B

)

RESULT(

FIRST(customer_id of A) as customer_id

, LAST(sales_date of B) as b_sales_date

, FIRST(sales_date of A) as a_sales_date

, COUNT(* of ANY(A)) as visit_count

)

) T

, prod.CUSTOMER_DIM C

WHERE T.CUSTOMER_ID = C.CUSTOMER_ID

GROUP BY C.GENDER, C.FREQUENT_BUYER_LEVEL, visit_count

  ORDER BY C.GENDER, C.FREQUENT_BUYER_LEVEL;

Detect customers that purchase the same category of items in three baskets in a row with a total value of those items greater than $150 just before Halloween (Use the month of October 2008).

Display the customer name, category name and three month basket value in the output.

select C.first_name || ' ' || C.last_name, A_all_product_categories, A_basket_value, B_basket_value, C_basket_value

from npath(

ON (select * from npath( ON ( select s.basket_id, p.product_category_name, s.sales_date, s.customer_id, SUM((p.retail_price - p.unit_cost - s.discount_amount) * s.sales_quantity)::DECIMAL(10,2) AS catValue from prod.sales_fact as s INNER JOIN prod.product_dim as p ON (s.product_id = p.product_id) WHERE s.sales_date::date between to_date('2008-10-01', 'YYYY-MM-DD') and to_date('2008-10-31', 'YYYY-MM-DD') group by customer_id, sales_date, basket_id, product_category_name)

PARTITION BY basket_id

ORDER BY product_category_name

MODE(NONOVERLAPPING)

PATTERN('A*')

SYMBOLS ( true as A)

RESULT(first(basket_id of A) as basket_id,

accumulate(product_category_name of A) as all_product_categories,

first(sales_date of A) as sales_date,

first(customer_id of A) as customer_id,

sum(catvalue of A) as basket_value)))

PARTITION BY customer_id

ORDER BY sales_date, basket_id

MODE(OVERLAPPING)

PATTERN('A.B.C')

SYMBOLS (true as A, true as B, true as C)

RESULT(first(customer_id of A) as customer_id,

first(all_product_categories of A) as A_all_product_categories,

first(all_product_categories of B) as B_all_product_categories,

first(all_product_categories of C) as C_all_product_categories,

first(basket_value of A) as A_basket_value,

first(basket_value of B) as B_basket_value,

first(basket_value of C) as C_basket_value)) T,

  1. prod.customer_dim C

WHERE T.customer_id = C.customer_id AND

A_all_product_categories like B_all_product_categories AND

C_all_product_categories like B_all_product_categories AND

  (A_basket_value + B_basket_value + C_basket_value > 150.00);

Detect stores with decreasing basket value and a decreasing number of customers between the starting month and the next month whilst there is an increase in both the basket value and number of customers (basket count) in the third month. In this case review Jan 2008 through Mar 2008 (1st Quarter). We are looking for any stores where there was a dip in customer baskets and basket values in the second month of the quarter. Display the Store Name, Basket Count and Basket Value for each month.

SELECT s.store_name, A_visits, B_visits, C_visits,

A_BasketValue, B_BasketValue, C_BasketValue

FROM npath(

ON

(select store_id, count(distinct basket_id) as visits_count,

AVG((p.retail_price - p.unit_cost - sf.discount_amount) * sf.sales_quantity)::DECIMAL(10,2) AS BasketValue

, date_trunc('month', sales_date) as month_Id

FROM prod.sales_fact sf, prod.product_dim p

WHERE sales_date::date between to_date('2008-01-01', 'YYYY-MM-DD') and to_date('2008-03-31', 'YYYY-MM-DD')

group by store_id, date_trunc('month', sales_date))

PARTITION BY store_id

ORDER BY month_Id

MODE(NONOVERLAPPING)

PATTERN( 'A.B.C')

SYMBOLS(true as A, true as B, true as C)

RESULT(FIRST (store_id of A) as store_id,

FIRST(visits_count of A) as A_visits,

FIRST(visits_count of B) as B_visits,

FIRST(visits_count of C) as C_visits,

FIRST(BasketValue of A) as A_BasketValue,

FIRST(BasketValue of B) as B_BasketValue,

  FIRST(BasketValue of C) as C_BasketValue)) T,
  1. prod.store_dim S

WHERE

A_visits > B_visits AND

B_visits < C_visits AND

A_BasketValue > B_BasketValue AND

B_BasketValue < C_BasketValue AND

  S.store_id = T.store_id;

Show all users who accessed the website, including their entry page, the web page they were really interested in and the full path.

select uid, interesting_page, first_page, visit_length as path_length

from npath(

ON large.msnbc_data

PARTITION BY uid

ORDER BY seqno

MODE( nonoverlapping )

PATTERN( 'FIRST.X*.INTERESTINGPAGES' )

SYMBOLS( pid NOT IN (17,16) as FIRST

, true as X

, pid IN (17,16) as INTERESTINGPAGES

)

RESULT( first(uid+0 of FIRST) as uid

, first(pid of INTERESTINGPAGES) as interesting_page

, first(pid of FIRST) as first_page

, count(* of X) as visit_length

)

  ) T;

Which category of pages occurs most frequently following the front page

create table rv_nextpagefrequency(partition key(nextpage)) as

select NextPage, count(*) as Frequency

from npath(

ON (select f.uid as userid, f.seqno as seqno, d.name as pagename

from large.msnbc_data f, msnbc_pages d

  where f.pid = d.pid )

PARTITION BY userid

ORDER BY seqno

MODE(nonoverlapping)

PATTERN( 'HOMEPAGE+.NONHOME' )

SYMBOLS( pagename ='frontpage' as HOMEPAGE

, pagename != 'frontpage' as NONHOME

)

RESULT( first(pagename of NONHOME) as NextPage )

)

GROUP BY 1;

What are the most common ways in which people landed up in Travel

create table rv_interestingpage_pathlength(partition key(uid)) as

select uid, d1.name as interestingpage, d2.name as firstpage, visit_length as pathlength

from npath(

ON large.msnbc_data

PARTITION BY uid

ORDER BY seqno

MODE( nonoverlapping )

PATTERN( 'FIRST.X*.INTERESTINGPAGES' )

SYMBOLS( pid NOT IN (17,16) as FIRST

, true as X

, pid IN (17,16) as INTERESTINGPAGES

)

RESULT( first(uid+0 of FIRST) as uid

, first(pid of INTERESTINGPAGES) as interestingpage

, first(pid of FIRST) as first

, count(* of X) as visit_length)

) T

, msnbc_pages d1

, msnbc_pages d2

WHERE T.interestingpage = d1.pid

AND T.first=d2.pid;

select count(*) from rv_interestingpage_pathlength;

Site map - for optimizing search engine efficiency, AB testing for website layout etc.

create table five_pa_step1(partition key(userid)) as

select userid, replace(rtrim(ltrim(PagePath,'['),']'),', ',',') as PagePath, 1::integer as Count

from npath(

ON (select f.uid as userid, f.seqno as seqno, d.name as pagename

from large.msnbc_data f, msnbc_pages d

where f.pid = d.pid

--limit 100

)

PARTITION BY userid

ORDER BY seqno

MODE(nonoverlapping)

PATTERN( 'A.A?.A?.A?.A?.X*$' )

SYMBOLS( true as A, true as X)

RESULT( first(userid+0 of A) as userid,

accumulate(pagename of A) as PagePath)

);

select count(*) from five_pa_step1;

-- Step 2: Then create the paths and subpaths using Path_Generator

drop table if exists five_pa_step2;

create table five_pa_step2(partition key(userid)) as

select *

from path_generator(

ON five_pa_step1

SEQ( 'pagepath' )

);

select count(*) from five_pa_step2;

-- Step 3: Then summarize paths so far using Path_Summarizer

drop table if exists five_pa_step3;

create table five_pa_step3(partition key(prefix)) as

select *

from path_summarizer(

ON five_pa_step2

PARTITION BY prefix

COUNT('count')

SEQ( 'sequence' )

PARTITIONNAMES('prefix')

PREFIX('prefix')

);

select count(*) from five_pa_step3;

  analyze five_pa_step3;

-- Step 4: Do the Path Starter

drop table if exists five_pa_step4;

create table five_pa_step4(partition key(partitioned)) as

select *

from path_start(

ON five_pa_step3

PARTITION BY parent

COUNT('count')

PARENT('parent')

PARTITIONNAMES('partitioned')

NODE('node')

);

select count(*) from five_pa_step4;

  analyze five_pa_step4;

Find user click-paths starting at pageid 50 and passing exclusively
through either pageid 80 or pages in category 9 or category 10. Find
the pageid of the last page in the path and count the number of times
page 80 was visited. Report the maximum count for each last page, and
sort the output by the latter. Restrict to paths containing at least 5
pages. Ignore pages in the sequence with category < 0.

SELECT last_pageid, MAX(count_page80) FROM nPath(

ON ( SELECT * FROM clicks WHERE category >= 0 )

PARTITION BY sessionid

ORDER BY ts

PATTERN ('A.(B|C)*')

MODE (OVERLAPPING)

SYMBOLS ( pageid = 50 AS A,

pageid = 80 AS B,

pageid <> 80 AND category IN (9,10) AS C )

RESULT ( LAST(pageid OF ANY(A,B,C)) AS last_pageid,

COUNT(* OF B) AS count_page80,

COUNT(* OF ANY(A,B,C)) AS count_any )

  ) WHERE count_any >= 5 GROUP BY last_pageid ORDER BY MAX(count_page80);
Find all pages that had began with fewer than 100 pageviews, then suddenly spiked to greater than 10000 pageviews for some period of time, then may or may not have relaxed back down

SELECT * FROM nPATH(

on <table_name>

PARTITION BY pagename,projectcode

ORDER BY statdate,hour

PATTERN('LOW+.HIGH+.LOW2*')

MODE(NONOVERLAPPING)

SYMBOLS(pageviews < 100 AS LOW, pageviews>10000 AS HIGH, pageviews< 100 as LOW2)

RESULT (FIRST(pagename||'' of low) as pagename

    ,  LAST(pageviews of LOW) as views_low

    ,  LAST(hour of LOW) as low_change_hour

    ,  FIRST(pageviews of HIGH) as views_high

    ,  LAST(hour of HIGH) as high_change_hour

    ,  FIRST(pageviews of LOW2) as views_low2)

);

Determine average lifespan and frequency when throwing a grenade in the game.

SELECT

path

, COUNT(1) AS freq

, ROUND(AVG(death_time - throw_time)::NUMERIC, 2) AS avg_alive_time

FROM nPath(

    ON (SELECT roundid, playername, roundtime, event, eventid

         FROM bf3_log WHERE event NOT IN (

'NetworkPerformance'

                                          , 'PlayerTick'

                                          , 'PlayerCommonMetric'

                                          , 'UnlockStatPoints'

                                          , 'DeserterReturns'

                                         )

                                    -- Safeguards against data inconsistency

                                    -- Remove when the data is clean

                                    AND roundtime IS NOT NULL

                                    AND event IS NOT NULL

                                    AND playername IS NOT NULL

                                    AND eventid IS NOT NULL

        )

    PARTITION BY roundid, playername

    ORDER BY eventid

    MODE (NONOVERLAPPING)

PATTERN('S.O*.G+.AG*.D')

    SYMBOLS(

        event = 'GrenadeThrow' AS G,

        event = 'Death' AS D,

        event = 'Spawn' AS S,

        event NOT IN ('GrenadeThrow', 'Death', 'Spawn') AS AG,

        event NOT IN ('GrenadeThrow', 'Death', 'Spawn') AS O

    )

    RESULT (

          FIRST(event OF AG) AS path

        , LAST(roundtime OF G) AS throw_time

        , FIRST(roundtime OF D) AS death_time

        , FIRST(roundid OF D) AS roundid

        , FIRST(playername OF D) AS playername

    )

)

GROUP BY 1

ORDER BY 2 DESC

Weapon Accuracy Over Time

SELECT *

FROM nPath (

ON ( SELECT … )

PARTITION BY nucleuspersonaid

ORDER BY roundstartdate

MODE (NONOVERLAPPING)

PATTERN ('(BEGINNER){3,}.(ADVANCED){3,}')

SYMBOLS ( hit_percentage >= coalesce(value_5, 0)::float AND hit_percentage < value_40::float AS BEGINNER

, hit_percentage >= value_80::float AND hit_percentage < value_100::float AS ADVANCED )

RESULT ( FIRST(nucleuspersonaid OF BEGINNER) AS nucleuspersonaid

, ACCUMULATE(hit_percentage OF ANY(BEGINNER, ADVANCED)) AS hit_percentage_list )

  ) n;

Abandon cart optimization – patterns in page path of checkout flow that result in abandonment. Discover previously unknown navigation path – where do users go right after abandoning cart? Flaw or opportunity for optimization.

select case when last_checkout_page='ordersubmitted' then false else true end as abandon_cart_flag,

next_page_after_checkout, checkout_path

from npath(

on pageview

partition by userid

order by eventtimestamp

pattern('CHECKOUT+.NEXTPAGE?')

mode(nonoverlapping)

symbols(pagecategory='checkout' as CHECKOUT, pagecategory!='checkout' as NEXTPAGE)

result(accumulate(url of CHECKOUT) as CHECKOUT_PATH,

last(url of CHECKOUT) as LAST_CHECKOUT_PAGE,

  first(url of NEXTPAGE) as NEXT_PAGE_AFTER_CHECKOUT );

Determine the top career transition path from software engineer to CEO

select job_transition_path, count(*) as count

from npath(

on linkedin

partition by userid

order by startdate

pattern('ENGR.OTHER*.CEO')

mode(nonoverlapping)

symbols(title ilike 'software eng%' as ENGR, lower(title)!='ceo' as OTHER, lower(title)='ceo' as CEO )

result( accumulate(title of ANY(ENGR,OTHER,CEO)) as job_transition_path )

)

  group by 1 order by 2 desc;

Business Question: Determine which properties contribute to user signups

Analytics Question: Events exist in multiple rows in the database, for each user •How do we attribute signup event back to user coming to the site many sessions/days ago

select dw_pfcid_entry

, dw_domainid_entry

, dw_refdomainid_entry,count(*) as number_of_sequences

, avg(number_of_pageviews_before_signup)

,avg(eventtimestamp_signup- eventtimestamp_entry)

from npath

(

on pageviews

partition by persistentid

order by eventtimestamp

mode (nonoverlapping) pattern ('^a+.s')

symbols ( dw_pfcid!=261 as a, -- non signup pfcdw_pfcid=261 as s -- first enter signup pfc)

result ( first(dw_pfcid of a) as dw_pfcid_entry,first(dw_domainid of a) as dw_domainid_entry

,first(dw_refdomainid of a) as dw_refdomainid_entry

,count(* of a) as number_of_pageviews_before_signup

,first(eventtimestamp of a) as eventtimestamp_entry

,first(eventtimestamp of s) as eventtimestamp_signup)) s

group by 1,2,3

  order by 4 desc limit 20;

Find customers who buy product P and buy it within 5 seconds

select * from npath

( on pageview

partition by userid

order by ts

mode (nonoverlapping)

pattern ('P.BUY')

symbols (url='p.html' as P

, url='checkout.html' and lag(ts,1)>=ts-interval '5 seconds' as BUY)

result (first(userid of P) as userid, first(ts of P) as p_ts, first(ts of BUY) as buy_ts)

  );

Min/max/avg/stdev of how many consecutive job postings do users view within one session?

select min(consecutive_jobview_count)

, avg(consecutive_jobview_count)

, stdev(consecutive_jobview_count)

from npath

(

on linkedin

partition by userid, sessionid

order by startdate

pattern('JOB+')

mode(nonoverlapping)

symbols(pagecategory='job' as JOB)

result( count(* of JOB) as consecutive_jobview_count )

  ) ;

Most popular sequence of 3 pages viewed before user upgrades from Business Plus to Executive

select prev3pages, count(*)

from npath(

on linkedin

partition by userid

order by startdate

pattern('A?.A?.A?.UPGRADE+.BPTOEXEC')

mode(nonoverlapping)

symbols(pagecategory!='upgradeworkflow' as A,

pagecategory='upgradeworkflow' as UPGRADE,

pagecategory='upgradeworkflow' and qs ilike '%upgrade=bp2exec%' as BPTOEXEC )

result( accumulate(URL of A) as prev3pages )

  ) group by 1 order by 2 desc limit 20;

For users who have been inactive for over 2 month and then come back to site, what is the length of inactivity, first page they land on, and the referral url?

select *

from npath

(

on linkedin

partition by userid

order by startdate

pattern('A.B')

mode(nonoverlapping)

symbols( true as A

, lag(ts,1)<=ts-interval '2 month' as B )

result( first(userid of A) as userid

, first(ts of A) as prev_ts

, first(ts of B) as return_ts

, first(url of B) as return_url

, first(refurl of B) as return_refurl )

  ) ;

Most popular sequence of 3 web pageviews (proxy log) before user downloads a malware (firewall log)

select malware_url, prev3_pageview_path, count(*) as count

from npath(

on logs

partition by userid

order by event_timestamp

pattern('A?.A?.A?.MALWARE')

mode(nonoverlapping)

symbols( log_type='proxy' as A, log_type='firewall' and event_type='malware' as MALWARE )

result( accumulate(URL of A) as prev3_pageview_path, first(URL of MALWARE) as malware_url )

)

group by 1,2

  order by 3 desc limit 20;


Find users who had more than 3 consecutive wrong password attempts. Return userid, # attempts, time of first and last attempt.

select * from npath(

on logs

partition by userid

order by event_timestamp

pattern('FAIL+')

mode(nonoverlapping)

symbols(log_type='vpn' and event_type='login' and event_status='wrong password' as FAIL)

result( first(userid of FAIL) as userid, count(* of FAIL) as fail_count,

first(event_timestamp of FAIL) as first_fail_timestamp,

last(event_timestamp of FAIL) as last_fail_timestamp

))

where fail_count>3

  order by fail_count desc;

Find users who access VPN after having no activity in any system for at least 1 month, find out what is the first web page they go to.

select * from npath

(

on logs

partition by userid

order by event_timestamp

pattern('A.RETURN.X*.WEB')

mode(nonoverlapping)

symbols(true as A

, lag(event_timestamp,1)<=event_timestamp-interval '1 month' as RETURN

, log_type!='proxy' as X, log_type='proxy' as WEB )

result( first(userid of A) as userid

, first(url of WEB) as first_webpage)

  );

Find user click-paths starting at page 50 and passing exclusively through either page 80 or pages in category 9 or category 10. Find the pageid of the last page in the path and count the number of times page 80 was visited. Report the maximum count for each last page, and sort the output by the latter. Restrict to paths containing at least 5 pages. Ignore pages in the sequence with category < 0.

SELECT last_page, max(B_count) as max_count

FROM nPath(

ON (select * from clicks where category >= 0)

PARTITION BY user_id

ORDER BY timestamp

MODE(OVERLAPPING)

PATTERN(‘A.(B|C)*’)

SYMBOLS(

pageid = 50 as A,

pageid = 80 as B,

pageid <> 80 and category in (9,10) as C)

RESULT(

count(* of ANY(A,B,C)) as count,

last(page_id of ANY(A,B,C)) as last_page,

count(* of ANY(B)) as B_count)

)

WHERE count >= 5

GROUP BY last_page

  ORDER By max_count;

Find all orders that were entered, but were cancelled before being fulfilled.

A log of events in the trading system.

Event Codes:

ECN_EVENTNUM = 1 "Got Order"

ECN_EVENTNUM = 4 "Preference Filled"

ECN_EVENTNUM = 9 "Internal Fill"

ECN_EVENTNUM = 8 "Cancel Order"

SELECT

order_number, symbol, actiondate

FROM nPath(

ON transactions

PARTITION BY order_number

ORDER BY actiondate

MODE(OVERLAPPING)

PATTERN(’O.U*.C‘)

SYMBOLS(

(ecn_eventnum = 1) AS O,

(ecn_eventnum <> 4 and ecn_eventnum <> 9 ) AS U,

(ecn_eventnum = 8) AS C)

RESULT(

first(ordnum of O) as order_number,

first(symbol of O) as symbol,

  first(actiondate of O) as actiondate));

For each new day's data (of mostly impressions, few clicks, very few actions), look back 28 days of history and match the click to the last impression shown to that user. Match the action to the last click done by that user. If there's no click, then match the action to the last impression shown by the user. If there is no matching impression for a click or an action, then simply skip over to the next click/action to attribute.

create table Attributed(partition key(client_id)) as

select

(case when impression_days = '[]' then null else ltrim(rtrim(impression_days,']'),'[') end)::date as impression_day,

(case when click_days = '[]' then null else ltrim(rtrim(click_days,']'),'[') end)::date as click_day,

(case when action_days = '[]' then null else ltrim(rtrim(action_days,']'),'[') end)::date as action_day,

id, mp_cookie, creative_id, client_id, campaign_id, vendor_id, action_code,

case when extended_attribute = 'null' then null else extended_attribute end as extended_attribute

from npath(

on (

select id, coalesce(mp_cookie,-1) as mp_cookie,

coalesce(creative_id,-1) as creative_id,

coalesce(client_id,-1) as client_id,

coalesce(campaign_id,-1) as campaign_id, coalesce(vendor_id,-1) as vendor_id,

action_code, log_day, coalesce(extended_attribute,'null') as extended_attribute

from events

where (action_code = 1 or action_code = 2 or action_code = 3) and log_day is not null

)

partition by client_id,creative_id, campaign_id, vendor_id, mp_cookie

order by log_day,action_code

mode(nonoverlapping)

pattern('(I.C?.A?)|(C.A?)|(A)')

symbols( action_code=1 as I,

action_code=2 as C,

action_code=3 as A

)

result( accumulate(log_day of I) as impression_days,

accumulate(log_day of C) as click_days,

accumulate(log_day of A) as action_days,

first(id of ANY(I,C,A)) as id,

first(mp_cookie of ANY(I,C,A)) as mp_cookie,

first(creative_id of ANY(I,C,A)) as creative_id,

first(client_id of ANY(I,C,A)) as client_id,

first(campaign_id of ANY(I,C,A)) as campaign_id,

first(vendor_id of ANY(I,C,A)) as vendor_id,

first(action_code of ANY(I,C,A)) as action_code,

first(extended_attribute of ANY(I,C,A)) as extended_attribute

)

  ) T;

Are customers not finding the right product? is the price too high?

SELECT *

FROM nPath

ON (hit_data)

PARTITION BY visid_low, visid_high, visit_num

ORDER BY date_time

MODE(NONOVERLAPPING)

PATTERN('CHECKOUT_INTENT.(NOTTHANKYOU*.SEARCH.NOTTHANKYOU*){2,}.NOTTHANKYOU*$')

SYMBOLS (

pagename ~ 'Search Results' and search_terms != lag(search_terms, 1, ''::varchar) as SEARCH,

NOT (pagename ~ 'Checkout - Thank You' OR pagename ~ 'Search Results' ) AS NOTTHANKYOU,

(pagename ~ 'Checkout Entry'

OR pagename ~ 'Checkout - Payment Info'

OR pagename ~ 'Checkout - Enter Password'

OR pagename ~ 'Checkout - Billing Contact'

OR pagename ~ 'Checkout - Add.Edit Shipping Address'

  ) AS CHECKOUT_INTENT;

Determine when/what other products are purchased when a camera is purchased.

select accessory_time - anchor_time as diff, *

from nPath (

on (select * from transaction_data )

partition by customer_id

order by post_date desc

SYMBOLS(

category_name='Digital Cameras' as anchorItem,

category_name='External Hard Drive' as externalDisk,

category_name='Digital Picture Frames' as frame,

category_name='Printers' as printer,

category_name='Camera & Camcorder Accessories' as otherAccessories,

true as X

)

PATTERN('(frame|printer|externalDisk|otherAccessories).X*.anchorItem')

mode(overlapping)

result(

first(post_date of anchorItem) as anchor_time,

first_notnull(post_date of ANY(externalDisk,frame,printer,otherAccessories)) as accessory_time,

first(customer_id of anchorItem) as customer_id,

first_notnull(category_name of ANY(externalDisk,printer,frame,otherAccessories)) as category_name

)

)

  order by customer_id;


What are the most populate pages once a customer lands on the home page?

SELECT NextPage, COUNT(*) as Frequency

FROM NPATH(

ON ( SELECT f.uid as userid, f.seqno as seqno, d.name as pagename

FROM msnbc_data f, msnbc_pages d

WHERE f.pid = d.pid )

PARTITION BY userid

ORDER BY seqno

MODE(nonoverlapping)

PATTERN( '^HOMEPAGE.ANYPAGE' ) --supports a regular expression over many rows of data

SYMBOLS( pagename ='frontpage' as HOMEPAGE

, true as ANYPAGE )

RESULT( FIRST(pagename of ANYPAGE) as NextPage ) --supports computing a wide variety of metrics on each match of the pattern)

  GROUP BY 1;

Determine when/what other products are purchased when a camera is purchased.

SELECT firstpage, interestingpage, COUNT(*) as page views, AVG( path_length) as avg_pathlength

FROM NPATH(

ON ( SELECT f.uid as userid, f.seqno as seqno, d.name as pagename

FROM msnbc_data f, msnbc_pages d

WHERE f.pid = d.pid )

PARTITION BY userid

ORDER BY seqno

MODE( nonoverlapping )

PATTERN( 'FIRSTPAGE.X*.INTERESTINGPAGES' )

SYMBOLS( pagename NOT IN (‘travel’,’bbs’) as FIRSTPAGE

, true as X

, pagename IN (‘travel’,’bbs’) as INTERESTINGPAGES )

RESULT( FIRST(userid of FIRSTPAGE) as uid

, FIRST(pagename of INTERESTINGPAGES) as interestingpage

, FIRST(pagename of FIRSTPAGE) as first

, COUNT(* of X) as path_length )) T

  GROUP BY 1,2

In table loan_transactions, find sequence of credit line increase request (CLI) followed eventually by default (2 or more consecutive missedpayment). Find the customer and the latency from CLI to default.

SELECT *, default_ts - cli_ts as cli_to_default_latency

FROM npath (

ON loan_transactions

partition by customer

order by eventtimestamp

mode(nonoverlapping)

pattern('CLI.X*.DEFT{2}')

symbols(eventtype='CLI' as CLI

, true as X

, eventtype='missedpayment' as DEFT )

result( first(customer of CLI) as customer

, first(eventtimestamp of CLI) as CLI_ts

  , last(eventtimestamp of DEFT) as default_ts ));

In table loan_transactions, for each incidence of default (2 or more consecutive missedpayment), return the customer and the amount of the very last payment made.

SELECT *

FROM npath ( ON loan_transactions

partition by customer

order by eventtimestamp

mode(nonoverlapping)

pattern('PAYMENT.DEFT{2}')

symbols(eventtype='payment' as PAYMENT,

eventtype='missedpayment' as DEFT )

result( first(customer of PAYMENT) as customer,

first(amount of PAYMENT) as last_payment_amount )

  );

In table pageviews, for all incidence of visiting the "mortgage apply form" page, return the customer, the sequence of the last 3 urls visited leading up to that mortgage apply form pageview, as well as latency from first and last of those 3 pageviews to the mortgage apply form pageview event.

select *, apply_eventts-pvfirst_eventts as latency_from_1st, apply_eventts-pvlast_eventts as latency_from_last

from npath (

on pageviews

partition by customer

order by eventtimestamp

mode(nonoverlapping)

pattern('PAGEVIEW?.PAGEVIEW?.PAGEVIEW.APPLY')

symbols(url!='mortgage apply form' as PAGEVIEW, url='mortgage apply form' as APPLY )

result( first(customer of PAGEVIEW) as customer

, accumulate(url of PAGEVIEW) as pageview_sequence

, first(eventtimestamp of PAGEVIEW) as pvfirst_eventts

, last(eventtimestamp of PAGEVIEW) as pvlast_eventts

  , first(eventtimestamp of APPLY) as apply_eventts ));


In table monthly_balances, find instances where customers show at least 2 consecutive month to (prior) month balance decline of at least 10%, immediately followed by a balance increase of at least 30%. Return customer name and sequence of balances covering the declines and increase months.

select *

from npath (

on monthly_balances

partition by customer

order by eventdate

mode(nonoverlapping)

pattern('PREV.DOWN.DOWN+.UP')

symbols(true as PREV, lag(balance,1)>= (balance/0.9)::int as DOWN, lag(balance,1)<= (balance/1.3)::int as UP )

result( first(customer of DOWN) as customer, accumulate(balance of ANY(PREV,DOWN,UP)) as balance_sequence )

  );

In table savings_transactions, find incidence of at least three consecutive small deposits of less than $30 that are within an hour of the prior transaction, followed by a large debit of at least $150 that is within 2 days of the prior transaction. Return the customer, the count and sum of the small deposits, start/end time of the small deposits, amount and time of the large debit.

select *

from npath (

on savings_transactions

partition by customer

order by eventtimestamp

mode(nonoverlapping)

pattern('INITIALSMALLDEP.SMALLDEP.SMALLDEP+.BIGDEBIT')

symbols(amount>=0 and amount<30 as INITIALSMALLDEP

, amount>=0 and amount<30 and lag(eventtimestamp,1)>=eventtimestamp-interval '1 hour' as SMALLDEP

, amount<=-150 and lag(eventtimestamp,1)>=eventtimestamp-interval '48 hours' as BIGDEBIT

)

result(first(customer of SMALLDEP) as customer

, count(* of ANY(INITIALSMALLDEP,SMALLDEP)) as smalldeposit_count

, sum(amount of ANY(INITIALSMALLDEP,SMALLDEP)) as smalldeposit_total

, first(eventtimestamp of INITIALSMALLDEP) as smalldeposit_starttime

, last(eventtimestamp of SMALLDEP) as smalldeposit_endtime

, first(eventtimestamp of BIGDEBIT) as bigdebit_time

, first(amount of BIGDEBIT) as bigdebit_amount )

  );

In table credit_transactions, identify incidence of at least 2 autorepair category spend within 2 weeks of the prior autorepair spend, immediately followed by an autodealer spend of at least $500 that is within 2 weeks of the prior autorepair spend. Return the customer, the number and total autorepair spend, the date of first and last autorepair spend, and date and amount of the autodealer spend.

select *

from npath (

on credit_transactions

partition by customer

order by eventdate

mode(nonoverlapping)

pattern('PREV.REPAIR.REPAIR+.DEALER')

symbols(merchantcategory='autorepair' as PREV

, merchantcategory='autorepair' and lag(eventdate,1)>=(eventdate - interval '2 weeks')::date as REPAIR

, merchantcategory='autodealer' and amount>=500 and lag(eventdate,1)>=(eventdate - interval '2 weeks')::date as DEALER)

result(first(customer of REPAIR) as customer

, count(* of ANY(PREV,REPAIR)) as repair_count

, sum(amount of ANY(PREV,REPAIR)) as repair_total

, first(eventdate of PREV) as repair_startdate

, last(eventdate of REPAIR) as repair_enddate

, first(eventdate of DEALER) as dealer_date

, first(amount of DEALER) as dealer_amt )

  );

In table savings_transactions, identify pairs of adjacent deposits (ignore debits) whose amounts are over $500 and within 2% of each other. Return the customer, the two amounts, and the time lag between the pair of deposits.

select customer, a_amount, b_amount, b_time-a_time as time_lag

from npath ( on (select * from savings_transactions where amount>=0)

partition by customer

order by eventtimestamp

mode(nonoverlapping)

pattern('A.B')

symbols(amount>=500 as A, amount>=500 and

lag(amount,1)>=(amount*0.98)::int and lag(amount,1)<=(amount*1.02)::int as B)

result(first(customer of A) as customer,

first(eventtimestamp of A) as a_time,

first(eventtimestamp of B) as b_time,

first(amount of A) as a_amount,

first(amount of B) as b_amount)

  );

Find the top 100 search terms. From the top 100 search terms, find the top 5 Health Centers most visited. Sort the output by the top number of visitors per Search Word and group by the Search Word.

CREATE FACT TABLE npath2_1

(partition key (visitor_id)) AS

SELECT *

FROM nPath(

ON (

SELECT pv.visitor_id, pv.visit_num, pv.page_view_num, pv.search_terms_txt, w.health_channel_nm, w.page_nm

FROM Global_Page_View pv

, Web_Page w

WHERE pv.Web_Resource_ID = w.Web_Resource_ID

AND pv.Page_Event_cd = 0

AND w.Report_Suite_cd = 1

AND pv.Page_view_dtm BETWEEN '2010-03-01 00:00:00' AND '2010-03-31 23:59:59'

)

PARTITION BY visit_num, visitor_id

ORDER BY page_view_num

MODE (NONOVERLAPPING)

PATTERN ( 'SEARCH.OTHER*.HCENTER' )

SYMBOLS ( search_terms_txt IS NOT NULL AND search_terms_txt != 'no search term' AS SEARCH

, health_channel_nm IS NOT NULL AND health_channel_nm != 'ntc' AS HCENTER

, NOT ( ( search_terms_txt IS NOT NULL AND search_terms_txt != 'no search term' )

OR ( health_channel_nm IS NOT NULL AND health_channel_nm != 'ntc' )

) AS OTHER

)

RESULT ( FIRST(visitor_id + 0 OF SEARCH) AS visitor_id

, FIRST(visit_num OF SEARCH) AS visit_num

, FIRST(health_channel_nm OF HCENTER) AS health_channel_nm

, FIRST(page_nm OF HCENTER) AS page_nm

, FIRST(search_terms_txt OF SEARCH) as search_terms_txt )

  ) n ;

-- Using this table we get the list of search words and find the top 100 search terms.

-- For each search term, top 5 health centers.

SELECT * FROM (

SELECT iv2.search_rank

, iv1.search_word

, Health_Center

, RANK() OVER( PARTITION BY iv1.search_word ORDER BY num_visitors DESC) AS hc_rank

, num_visitors

, num_visits

, num_pageviews

FROM

(SELECT health_channel_nm AS Health_Center

, regexp_split_to_table ( search_terms_txt, ' ' ) AS search_word

, COUNT(DISTINCT visitor_id) as num_visitors

, COUNT(DISTINCT visitor_id, visit_num) as num_visits

, COUNT(*) as num_pageviews

FROM npath2_1

GROUP BY 1,2

) iv1,

(SELECT search_word

, ROW_NUMBER() OVER( ORDER BY count(*) DESC ) AS search_rank

FROM (

SELECT regexp_split_to_table ( search_terms_txt, ' ' ) AS search_word FROM npath2_1

) search_words

WHERE search_word NOT IN ('in','of','and','on','the','for','to','how','what','from','when','a','is','i','with','','you','my', 'after','can')

-- GROUP BY

ORDER BY count(*) DESC

LIMIT 100) iv2

WHERE iv1.search_word = iv2.search_word) iv3

WHERE iv3.hc_rank <= 5

ORDER BY search_rank, hc_rank

  ;

A custom Advertiser level report that is intended to assign credit (count) users based on how they progress through various stages of engagement in the user lifecycle toward conversion events. This query is a modified funnel report looking to count users who have specified "top funnel" events and "bottom funnel" events. A users' sequential progress is traced and not just the strict series of events.

Further, for this report, the client is looking to assign "credit" to different parts of the campaign depending on how someone entered the funnel.

SELECT coalesce( col1_event, 0) as Last_Event

, coalesce(col2_event, 0) as Homepage_Click

, col3_event as Conversion_Event

, count(*) as Number_of_Paths

FROM nPath(

ON ( select elo_id, elo_user_id, elo_action_id,elo_date

from rpt_event_log

)

PARTITION BY elo_user_id

ORDER BY elo_date

MODE(overlapping)

PATTERN( '(Col2.X*.Col1.X*.Col3)|(Col1.X*.Col1.Col3)|(Col1.X*.Col2+.X*.Col3)' )

SYMBOLS( elo_action_id in (900,906) as Col1

, elo_action_id in (904) as Col2

, elo_action_id in (558) as Col3

, elo_action_id NOT in (900, 906, 904, 558) as X

)

RESULT( last( elo_action_id of Col1 ) as Col1_event

, first( elo_action_id of Col2) as Col2_event

, last( elo_action_id of Col3) as Col3_event

, last( elo_user_id of X) as userid_session

)

) T

  GROUP BY 1,2,3;

Show me the last 3 impressions viewed by any unique user who saw more than 10 impressions prior to conversion. Show me the most common n impressions viewed by a unique user who saw exactly 3 impressions

select iv.user_id as user_id

, iv.conv_date - iv.last_impr_date as window

, iv.prior_impr_count as prior_impr_count

, iv.path as path

from npath (

on ( SELECT rec_date, user_id, page_id, ad_id, 'I'::Char(1) AS source

FROM net_impression_col

WHERE rec_date between '2010-09-01' and '2010-09-01'::date + interval '30 day'

UNION ALL

SELECT rec_date, user_id, page_id, ad_id, 'A'::Char(1) AS source

FROM net_activity_col

WHERE activity_sub_type = 'order'

AND rec_date between '2010-09-01' and '2010-09-01'::date + interval '30 day'

)

partition by user_id

order by rec_date

mode(nonoverlapping)

pattern('impression+.conversion')

symbols(

source = 'I' as impression,

source = 'A' as conversion

)

result (

first(user_id OF conversion) AS user_id,

first(rec_date OF conversion) AS conv_date,

last(rec_date OF impression) AS last_impr_date,

count(* OF impression) AS prior_impr_count,

accumulate(page_id || ':' || ad_id of impression) as path

)

  ) iv;

How do we attribute a purchase back to a specific ad within a 30 day period?

SELECT product, user, time_of_purchase - time_of_ad

FROM nPath(

ON events

PARTITION BY product_id, user_id

ORDER BY time

MODE (nonoverlapping)

PATTERN( ‘ADSHOWN.INBETWEEN*.PURCHASE’ )

SYMBOLS( event_type = ‘impression’ AS ADSHOWN

, true AS INBETWEEN

, event_type = ‘buy’ AS PURCHASE)

RESULT( first(time of ADSHOWN) as time_of_ad

, first(time of PURCHASE) as time_of_purchase

, first(product_id of PURCHASE) as product

, first(user_id of PURCHASE) as user ))

  WHERE time_of_purchase – time_of_ad < ‘30 days’;

Find all instances where an operator logs in, then logs out after working for more than 3 hours, then logs back in within a minute. for all such incidents, return the crew member's name and the timestamp of these three events

select *

from npath(

on (select parent_node,("Time"::varchar(8) || ' '||substr("Time",9,6))::timestamp as eventtimestamp

, "CrewName" as crewname

, "RegId" as regid from XMLParser_v1 (on stld_raw parent('TRX_OperLogin,TRX_OperLogout')

children('Time,CrewName,RegId')

)

where "CrewName"!='' )

partition by CrewName

order by eventtimestamp

mode(nonoverlapping)

pattern('A.B.C')

symbols(parent_node='TRX_OperLogin' as A

, parent_node='TRX_OperLogout' and lag(eventtimestamp,1)<=eventtimestamp - interval '180 minute' as B

, parent_node='TRX_OperLogin' and lag(eventtimestamp,1)>=eventtimestamp - interval '1 minute'as C)

result(first(crewname of A) as CrewName

, accumulate(eventtimestamp of ANY(A,B,C)) as login_out_in_timesequence)

  );

Find all customers with 3+ consecutive decrease of 10%+ in monthly transaction value.
Target with retention offers; understand behavioral patterns of dis-engaging customers; study correlation with churn.

select *

from npath( ON customer_transaction_monthly_summary

partition by (customerid)

  order by transaction_month

pattern('A.B{3,}')

symbols(true as A, lag(1, amount)>=1.11*amount as B)

  );