WordCount/CharCount in PySpark


WORDCOUNT in PySpark


# Load the input file as an RDD of lines.
# Bug fix: a local-file URI needs three slashes ("file://" scheme + empty
# authority + absolute path "/Users/..."). "file://Users/..." makes "Users"
# the URI host rather than part of the path.
data = sc.textFile("file:///Users/furqan/data/wordcount.txt")

# Materialize the RDD on the driver to inspect it (small inputs only).
data.collect()

[u'hello good morning hello good morning']

# Break every line into whitespace-separated words, flattening all lines
# into a single RDD of words.
data_split = data.flatMap(lambda line: line.split())

# Pull the words back to the driver for inspection.
data_split.collect()

[u'hello', u'good', u'morning', u'hello', u'good', u'morning']

# Pair each word with an initial count of one: (word, 1).
data_map = data_split.map(lambda word: (word, 1))

# Inspect the (word, 1) pairs on the driver.
data_map.collect()

[(u'hello', 1), (u'good', 1), (u'morning', 1), (u'hello', 1), (u'good', 1), (u'morning', 1)]


# Sum the per-word ones to obtain (word, total_count) pairs.
data_reduce = data_map.reduceByKey(lambda a, b: a + b)

# Inspect the final word counts on the driver.
data_reduce.collect()

[(u'good', 2), (u'hello', 2), (u'morning', 2)]


# Print one "word: count" line per (word, count) pair.
for pair in data_reduce.collect():
    print("%s: %i" % pair)

good: 2
hello: 2
morning: 2

CHARCOUNT in PySpark

# Character frequencies across the DISTINCT words (the keys of data_reduce):
# each word contributes its characters once, regardless of how often the
# word occurred (e.g. 'o' -> 4 from "good", "hello", "morning").
# Fix: removed the redundant identity step ".map(lambda char: char)" that
# sat between the flatMap and the (c, 1) mapping and did nothing.
char_counts = (data_reduce
               .flatMap(lambda pair: pair[0])       # a string iterates as chars
               .map(lambda c: (c, 1))
               .reduceByKey(lambda v1, v2: v1 + v2))


# Inspect the per-character counts on the driver.
char_counts.collect()

[(u'i', 1), (u'm', 1), (u'e', 1), (u'o', 4), (u'g', 2), (u'h', 1), (u'r', 1), (u'n', 2), (u'd', 1), (u'l', 2)]


# Print one "char: count" line per (character, frequency) pair.
for ch, freq in char_counts.collect():
    print("%s: %i" % (ch, freq))


i: 1
m: 1
e: 1
o: 4
g: 2
h: 1
r: 1
n: 2
d: 1
l: 2



No comments:

Post a Comment

Pages