[python] first letter count

Viewer

copy | download | embed | print | Name: first letter count
  1. import re
  2. import string
  3.  
  4. from pyspark import SparkContext
  5.  
  6.  
  7. def preprocess(token):
  8.     token = re.sub(r'[^\w\s]', '', token)  # Remove punctuation
  9.     return token.lower()  # Lowercase 
  10.  
  11.  
  12. def is_not_empty_string(token):
  13.     return token != ''
  14.  
  15.  
  16. def is_not_number_word(token):
  17.     return token[0].isalpha()
  18.  
  19.  
  20. def extract_first_letter(word):
  21.     return word[0]
  22.  
  23.  
  24. def main():
  25.  
  26.     sc = SparkContext(appName='SparkFirstLetterCount')
  27.  
  28.     counts = (
  29.         sc
  30.         .textFile('/user/cloudera/input/Sample-1.txt')
  31.         .flatMap(lambda line: line.split())
  32.         .map(lambda token: preprocess(token))
  33.         .filter(lambda token: is_not_empty_string(token))
  34.         .filter(lambda token: is_not_number_word(token))
  35.         .map(lambda token(extract_first_letter(token), 1))
  36.         .reduceByKey(lambda a, b: a + b)
  37.         .sortByKey(ascending=True)
  38.     )
  39.     counts.saveAsTextFile('/user/cloudera/output')
  40.  
  41.     sc.stop()
  42.  
  43. if __name__ == '__main__':
  44.     main()

Editor

You can edit this paste and save as new:


File Description
  • first letter count
  • Paste Code
  • 28 Feb-2021
  • 1008 Bytes
You can Share it: