WordCount/CharCount in PySpark


WORDCOUNT in PySpark


# Load the input file as an RDD of lines.
# Bug fix: a local-file URI needs three slashes ("file://" scheme + empty
# authority + absolute path "/Users/..."). "file://Users/..." makes "Users"
# the URI host rather than part of the path.
data = sc.textFile("file:///Users/furqan/data/wordcount.txt")

# Materialize the RDD on the driver to inspect it (small inputs only).
data.collect()

[u'hello good morning hello good morning']

# Break every line into whitespace-separated words, flattening all lines
# into a single RDD of words.
data_split = data.flatMap(lambda line: line.split())

# Pull the words back to the driver for inspection.
data_split.collect()

[u'hello', u'good', u'morning', u'hello', u'good', u'morning']

# Pair each word with an initial count of one: (word, 1).
data_map = data_split.map(lambda word: (word, 1))

# Inspect the (word, 1) pairs on the driver.
data_map.collect()

[(u'hello', 1), (u'good', 1), (u'morning', 1), (u'hello', 1), (u'good', 1), (u'morning', 1)]


# Sum the per-word ones to obtain (word, total_count) pairs.
data_reduce = data_map.reduceByKey(lambda a, b: a + b)

# Inspect the final word counts on the driver.
data_reduce.collect()

[(u'good', 2), (u'hello', 2), (u'morning', 2)]


# Print one "word: count" line per (word, count) pair.
for pair in data_reduce.collect():
    print("%s: %i" % pair)

good: 2
hello: 2
morning: 2

CHARCOUNT in PySpark

# Character frequencies across the DISTINCT words (the keys of data_reduce):
# each word contributes its characters once, regardless of how often the
# word occurred (e.g. 'o' -> 4 from "good", "hello", "morning").
# Fix: removed the redundant identity step ".map(lambda char: char)" that
# sat between the flatMap and the (c, 1) mapping and did nothing.
char_counts = (data_reduce
               .flatMap(lambda pair: pair[0])       # a string iterates as chars
               .map(lambda c: (c, 1))
               .reduceByKey(lambda v1, v2: v1 + v2))


# Inspect the per-character counts on the driver.
char_counts.collect()

[(u'i', 1), (u'm', 1), (u'e', 1), (u'o', 4), (u'g', 2), (u'h', 1), (u'r', 1), (u'n', 2), (u'd', 1), (u'l', 2)]


# Print one "char: count" line per (character, frequency) pair.
for ch, freq in char_counts.collect():
    print("%s: %i" % (ch, freq))


i: 1
m: 1
e: 1
o: 4
g: 2
h: 1
r: 1
n: 2
d: 1
l: 2



No comments:

Post a Comment

Pages