[python] first letter count
Viewer
*** This page was generated with the meta tag "noindex, nofollow". This happened because you selected this option before saving or the system detected it as spam. This means that this page will never get into the search engines and the search bot will not crawl it. There is nothing to worry about, you can still share it with anyone.
- import re
- import string
- from pyspark import SparkContext
def preprocess(token):
    """Normalize a raw token: strip punctuation, then lower-case it.

    Every character that is neither a regex word character nor
    whitespace is removed. The result may be an empty string when the
    token consisted solely of punctuation.
    """
    punctuation = re.compile(r'[^\w\s]')
    stripped = punctuation.sub('', token)
    return stripped.lower()
def is_not_empty_string(token):
    """Return True unless *token* equals the empty string."""
    is_empty = token == ''
    return not is_empty
def is_not_number_word(token):
    """Return True when *token* starts with an alphabetic character.

    Tokens that begin with a digit, an underscore, or any other
    non-letter are rejected. An empty token yields False rather than
    raising IndexError, so the predicate is safe to apply to any string.
    """
    # Slicing never raises: ''[:1] is '' and ''.isalpha() is False.
    return token[:1].isalpha()
def extract_first_letter(word):
    """Return the first element of *word*.

    *word* must be non-empty; indexing an empty sequence raises
    IndexError.
    """
    initial = word[0]
    return initial
def main(input_path='/user/cloudera/input/Sample-1.txt',
         output_path='/user/cloudera/output'):
    """Count words by their first letter with Spark and save the result.

    The input text is split on whitespace, each token is normalized with
    ``preprocess``, empty tokens and tokens that do not start with a
    letter are dropped, and the remaining tokens are tallied by their
    first character. The (letter, count) pairs are sorted by letter and
    written as text to *output_path*.

    Args:
        input_path: Path of the text file to read. Defaults to the
            location the script originally hard-coded.
        output_path: Destination passed to ``saveAsTextFile``. Defaults
            to the location the script originally hard-coded.
    """
    sc = SparkContext(appName='SparkFirstLetterCount')
    try:
        counts = (
            sc
            .textFile(input_path)
            .flatMap(lambda line: line.split())
            .map(preprocess)
            # Empty tokens must be removed before the first-letter checks.
            .filter(is_not_empty_string)
            .filter(is_not_number_word)
            .map(lambda token: (extract_first_letter(token), 1))
            .reduceByKey(lambda a, b: a + b)
            .sortByKey(ascending=True)
        )
        counts.saveAsTextFile(output_path)
    finally:
        # Always release the SparkContext, even if the job fails part-way
        # (for example while reading the input or writing the output).
        sc.stop()
# Run the Spark job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Editor
You can edit this paste and save as new: