AppCenter: How to Build a Word Cloud

Learn Data Science
Teradata Employee

I have been asked recently how to build a word cloud in Aster using AppCenter.  A word cloud is basically a visualization that shows a bag of words and their frequencies of occurrence.  The more frequently a word is used in your bag of words the bigger the bolder the font of that word shows up in the report.  It is a relatively simple report.


Word clouds are simple and convey meaning in graphical context that could be presented in a simple ranking report.  Where word clouds get interesting is not necessarily by the COUNT of the word but in the count of words that have purpose or meaning.  When building word clouds it is important to harmonize words by using stemming and other approaches.  It is also important to remove words that are meaningless:  such as stop words:  Words like THE, IS, AND, OR... etc.  It is all very subjective however.


Below is a sampling of the source code to generate a word cloud including the visualizations.  I have also attached the complete AppCenter app for your to add to your collection.


Below is an example of a word cloud:



wc1.JPG




/* DATA PREPARATION FOR THE VISUALIZATION STATEMENTS */

--BASE TABLE - DDL STATEMENT

CREATE FACT TABLE

public.hotel_reviews  (

city       varchar NULL,

hotelid    varchar NULL,

reviewdate timestamp NULL,

subject    varchar NULL,

review     varchar NULL,

reviewid   bigint NULL

)

DISTRIBUTE BY HASH (reviewid)

--STATEMENT:  hotel_reviews_specific  (Concatenate Subject and Reviews)

DROP TABLE IF EXISTS hotel_reviews_specific;

CREATE TABLE hotel_reviews_specific

DISTRIBUTE BY HASH (reviewid)

AS

SELECT h.reviewid,

h.subject || ' ' || h.review AS review,

h.hotelid,

h.reviewdate,

h.city

FROM hotel_reviews h

WHERE h.hotelid = '${hotelid}';

ANALYZE hotel_reviews_specific;

SELECT COUNT(*) FROM hotel_reviews_specific;

--STATEMENT:  hotel_reviews_specific_sentiment  (Run a sentiment Analysis)

DROP TABLE IF EXISTS hotel_reviews_specific_sentiment;

CREATE TABLE hotel_reviews_specific_sentiment

DISTRIBUTE BY HASH (reviewid)

AS

SELECT *

FROM ExtractSentiment (

  ON hotel_reviews_specific

  TEXT_COLUMN('review')

  ACCUMULATE ('reviewid', 'hotelid')

) es;

ANALYZE hotel_reviews_specific_sentiment;

SELECT COUNT(*) FROM hotel_reviews_specific_sentiment;

--STATEMENT:  hotel_reviews_specific_all

DROP TABLE IF EXISTS hotel_reviews_specific_all;

CREATE TABLE hotel_reviews_specific_all

DISTRIBUTE BY HASH (reviewid)

AS

SELECT a.city,

a.hotelid,

a.reviewdate,

a.reviewid,

a.reviewdate AS reviewtimestamp,

a.review,

s.out_polarity,

s.out_strength,

s.out_sentiment_words,

(CASE WHEN s.out_polarity = 'NEG' THEN (-1 * s.out_strength) ELSE s.out_strength END::INTEGER + 3) AS sentimentscore

FROM hotel_reviews_specific a

INNER JOIN

hotel_reviews_specific_sentiment s

ON a.reviewid = s.reviewid;

ANALYZE hotel_reviews_specific_all;

SELECT COUNT(*) FROM hotel_reviews_specific_all;

--STATEMENT:  hotel_reviews_specific_tfidf  (Run a Term Frequency, Inverse Document Frequency Analysis)

DROP TABLE IF EXISTS hotel_reviews_specific_tfidf;

CREATE TABLE hotel_reviews_specific_tfidf

DISTRIBUTE BY HASH (reviewid)

AS

SELECT tfidf.docid AS reviewid,

REPLACE(tfidf.term, '&QUOT', '') AS term,

tfidf.tf,

tfidf.idf,

tfidf.tf_idf

FROM TF_IDF (

  ON TF (

      ON (

        SELECT tp.reviewid AS docid,

        UPPER(tp.token) AS term,

        tp.frequency AS count

        FROM text_parser (

            ON (

              SELECT a.reviewid,

              REGEXP_REPLACE(a.review, '[\\.\\?,]', ' ', 'g') AS reviewtext

              FROM hotel_reviews_specific_all a

            )

            TEXT_COLUMN('reviewtext')

            CASE_INSENSITIVE('TRUE')

            PUNCTUATION('[\\\[.,?\!:;~()\\\]"]+')

            -- DELIMITER(' ')

            -- STEMMING ('true')

            REMOVE_STOP_WORDS ('true')

        ) tp

      )

      PARTITION BY docid

  ) AS TF

  PARTITION BY term

  ON (

      SELECT COUNT(DISTINCT a.reviewid) AS total_no_of_docs

      FROM hotel_reviews_specific_all a

  ) AS doccount

  DIMENSION

) tfidf;

--STATEMENT:  hotel_reviews_specific_tfidf_terms  (Consolidate Sentiment and TF IDF Analysis)

DROP TABLE IF EXISTS hotel_reviews_specific_tfidf_terms;

CREATE TABLE hotel_reviews_specific_tfidf_terms

DISTRIBUTE BY REPLICATION

AS

SELECT iv.hotelid,

iv.term,

iv.sentiment,

iv.tf_idf,

iv.rn

FROM (

  SELECT b.hotelid,

  a.term,

  CASE WHEN b.sentimentscore <= 2 THEN 'UNHAPPY'

      WHEN b.sentimentscore >= 4 THEN 'HAPPY'

      END AS sentiment,

  SUM(a.tf_idf) AS tf_idf,

  ROW_NUMBER() OVER (PARTITION BY CASE WHEN b.sentimentscore <= 2 THEN 'UNHAPPY' WHEN b.sentimentscore >= 4 THEN 'HAPPY' END ORDER BY SUM(a.tf_idf) DESC) AS rn

  FROM hotel_reviews_specific_tfidf a

  INNER JOIN

  hotel_reviews_specific_all b

  ON a.reviewid = b.reviewid

  WHERE LENGTH(a.term) >= 3

  AND b.sentimentscore IN (1, 2, 4, 5)

  GROUP BY 1, 2, 3

) iv

WHERE iv.rn <= 100;

ANALYZE hotel_reviews_specific_tfidf_terms;

SELECT COUNT(*) FROM hotel_reviews_specific_tfidf_terms;

/* VISUALIZATION CODE FOR TF_IDF and WORD CLOUDS */

insert into app_center_visualizations (json)

select json from Visualizer (ON (SELECT * FROM hotel_reviews_specific_tfidf_terms a WHERE a.sentiment = 'HAPPY') PARTITION BY 1

asterfunction('tfidf')

title('POSITIVE TFIDF TERMS')

viztype('wordcloud'));

insert into app_center_visualizations (json)

select json from Visualizer (ON (SELECT * FROM hotel_reviews_specific_tfidf_terms a WHERE a.sentiment = 'UNHAPPY') PARTITION BY 1

asterfunction('tfidf')

title('NEGATIVE TFIDF TERMS')

viztype('wordcloud'));

insert into app_center_visualizations (json) values (

'{

  "db_table_name":"hotel_reviews_specific",

  "vizType":"table",

  "version":"1.0",

  "title":"HOTEL DATA"

}');