Word count in PySpark
# Load the input file as an RDD of lines.
# NOTE: a local-file URI needs THREE slashes (file:// + absolute path);
# "file://Users/..." would treat "Users" as the URI authority/host.
data = sc.textFile("file:///Users/furqan/data/wordcount.txt")
data.collect()
[u'hello good morning hello good morning']
# Tokenize each line on whitespace, flattening into one RDD of words.
data_split = data.flatMap(lambda line: line.split())
data_split.collect()
[u'hello', u'good', u'morning', u'hello', u'good', u'morning']
# Pair every word with an initial count of 1: (word, 1).
data_map = data_split.map(lambda word: (word, 1))
data_map.collect()
[(u'hello', 1), (u'good', 1), (u'morning', 1), (u'hello', 1), (u'good', 1), (u'morning', 1)]
# Sum the counts per word key to get the final word frequencies.
data_reduce = data_map.reduceByKey(lambda a, b: a + b)
data_reduce.collect()
[(u'good', 2), (u'hello', 2), (u'morning', 2)]
# Print each word with its frequency, one per line ("word: count").
for word, count in data_reduce.collect():
    print("{0}: {1}".format(word, count))
good: 2
hello: 2
morning: 2
Character count in PySpark:
# Count character occurrences across the distinct words.
# flatMap over each (word, count) pair yields the word's characters
# one at a time (each[0] is the word; iterating a string gives chars).
# The original chain had a redundant identity step
# `.map(lambda char: char)` — removed, as it changed nothing.
char_counts = data_reduce.flatMap(lambda each: each[0]).map(lambda c: (c, 1)).reduceByKey(lambda v1, v2: v1 + v2)
char_counts.collect()
[(u'i', 1), (u'm', 1), (u'e', 1), (u'o', 4), (u'g', 2), (u'h', 1), (u'r', 1), (u'n', 2), (u'd', 1), (u'l', 2)]
# Print each character with its frequency, one per line ("char: count").
for ch, total in char_counts.collect():
    print("{0}: {1}".format(ch, total))
i: 1
m: 1
e: 1
o: 4
g: 2
h: 1
r: 1
n: 2
d: 1
l: 2
No comments:
Post a Comment