-- Hive queries for Word Count

drop table if exists doc;

-- 1) create table to load whole file
create table doc(
text string
) row format delimited fields terminated by '\n' stored as textfile;

--2) loads plain text file
--if file is .csv then in replace '\n' by ',' in step no 1 (creation of doc table)
load data local inpath '/home/trendwise/Documents/sentiment/doc_data/wikipedia' overwrite into table doc;

-- Trick-1
-- 3) wordCount in single line
SELECT word, COUNT(*) FROM doc LATERAL VIEW explode(split(text, ' ')) lTable as word GROUP BY word;  


-- Trick-2
-- temporary table to hold words...
CREATE TABLE words (word STRING);

-- make file & save as splitter.py
#!/usr/bin/python
import sys
for line in sys.stdin:
 for word in line.split():
   print word

-- add  file splitter in hive prompt
add file /home/trendwise/Documents/sentiment/query/document/hive/splitter.py;

INSERT OVERWRITE TABLE words 
  SELECT TRANSFORM(text) 
    USING 'python splitter.py' 
    AS word
FROM doc;