- from textblob import TextBlob
- import matplotlib.pyplot as plt
- from pyspark import SparkConf, SparkContext
- import re
- import string
- ##OTHER FUNCTIONS/CLASSES
def resolve_emoticon(line):
    """Replace ASCII emoticons in *line* with a descriptive English word.

    Emoticons are substituted longest-first so that a short emoticon never
    consumes part of a longer one: ``:))`` must not be read as ``:)``
    followed by ``)``, and ``>:-)`` must not be read as ``>`` followed by
    ``:-)``.

    Args:
        line: Text that may contain emoticons.

    Returns:
        The text with every known emoticon replaced by its word. Text
        without emoticons (including the empty string) is returned as is.
    """
    emoticon = {
        '>:-)': 'angry',
        ':-)': 'smile',
        ':))': 'very happy',
        ':((': 'very sad',
        ':-P': 'tongue',
        ':-o': 'gasp',
        # ':)' used to be listed twice ('sad' and 'happy'); a dict keeps only
        # the last value, so 'happy' is the mapping that was ever in effect.
        ':)': 'happy',
        ':(': 'sad',
    }
    # Sort explicitly instead of relying on the literal's order, so adding a
    # new entry anywhere in the table cannot reintroduce the prefix problem.
    for key in sorted(emoticon, key=len, reverse=True):
        line = line.replace(key, emoticon[key])
    return line
def abb_bm(line):
    """Expand common Bahasa Melayu chat abbreviations found in *line*.

    The text is split on whitespace; every token that exactly matches a
    known abbreviation is swapped for its full word, and the tokens are
    re-joined with single spaces. The result is then passed through
    ``resolve_emoticon`` before being returned.
    """
    abbreviation_bm = {
        'sy': 'saya',
        'sk': 'suka',
        'byk': 'banyak',
        'sgt': 'sangat',
        'mcm': 'macam',
        'bodo': 'bodoh',
        'kat': 'dekat',
    }
    expanded_tokens = []
    for token in line.split():
        # Unknown tokens pass through unchanged.
        expanded_tokens.append(abbreviation_bm.get(token, token))
    expanded_text = ' '.join(expanded_tokens)
    return resolve_emoticon(expanded_text)
def abb_en(line):
    """Expand common English chat abbreviations found in *line*.

    Each whitespace-separated token that exactly matches a known
    abbreviation is replaced by its long form; the tokens are re-joined
    with single spaces and the result is run through ``resolve_emoticon``.
    """
    abbreviation_en = {
        'u': 'you',
        'thr': 'there',
        'asap': 'as soon as possible',
        'lv': 'love',
        'c': 'see',
    }
    tokens = line.split()
    # Swap in the long form where one exists, keep the token otherwise.
    expanded = [
        abbreviation_en[token] if token in abbreviation_en else token
        for token in tokens
    ]
    return resolve_emoticon(' '.join(expanded))
def make_plot(pos, neg):
    """Display a two-bar chart of the positive and negative counts.

    Args:
        pos: Count shown under the "Positive" bar (converted with ``int``).
        neg: Count shown under the "Negative" bar (converted with ``int``).
    """
    bar_positions = [1, 2]
    bar_labels = ["Positive", "Negative"]
    bar_heights = [int(pos), int(neg)]

    # Axis labelling and title.
    plt.xlabel('Polarity')
    plt.ylabel('Count')
    plt.title('Sentiment Analysis - Lexical Based')
    plt.grid(True)

    # One bar per polarity, with the x ticks renamed to the polarity labels.
    plt.bar(bar_positions, bar_heights, align='center')
    plt.xticks(bar_positions, bar_labels)
    plt.show()
# Patterns are compiled once at import time instead of on every input line.
_URL_RE = re.compile(r'https?://\S+')
_NUM_RE = re.compile(r'\d+')
_MENTION_RE = re.compile(r'[@#]\w+')
# The leading \b keeps the retweet marker from matching the tail of an
# ordinary upper-case word (e.g. "SMART phone" must not become "SMAphone").
_RT_RE = re.compile(r'\bRT\s+')


def remove_features(data_str):  # by niewan
    """Strip tweet-specific noise from a line and normalise it.

    In order: removes the retweet marker ``RT`` (with its trailing
    whitespace), ``http``/``https`` links, ``@mentions`` and ``#hashtags``,
    and runs of digits; replaces emoticons with words via
    ``resolve_emoticon``; and lower-cases the result.

    Args:
        data_str: The raw line; it is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased text.
    """
    data_str = str(data_str)
    data_str = _RT_RE.sub('', data_str)  # remove RT
    data_str = _URL_RE.sub('', data_str)  # remove hyperlinks
    data_str = _MENTION_RE.sub('', data_str)  # remove @mentions and hashtags
    data_str = _NUM_RE.sub('', data_str)  # remove numerical digits
    # Emoticons are resolved before lower-casing because some of them are
    # case-sensitive (':-P').
    data_str = resolve_emoticon(data_str)
    return data_str.lower()
def main(sc, filename):  # by niewan
    """Run the sentiment pipeline on *filename* and plot the result.

    Each line is cleaned with ``remove_features`` and its language is
    detected. English lines get their abbreviations expanded with
    ``abb_en``; Malay lines are expanded with ``abb_bm`` and translated to
    English. Lines in any other language are ignored. The number of lines
    with positive and with negative TextBlob polarity is then plotted with
    ``make_plot``.

    Args:
        sc: The active SparkContext.
        filename: Path of the text file to analyse, one sentence per line.
    """
    cleaned = sc.textFile(filename).map(remove_features)  # remove and replace

    # Language detection rejects very short strings (fewer than 3
    # characters), so lines that cleaning reduced to almost nothing are
    # skipped instead of aborting the whole job.
    usable = cleaned.filter(lambda text: len(text) >= 3)

    # Detect the language once per line and cache the result; both language
    # branches below read it, and re-detecting would repeat the lookup for
    # every line on every pass.
    tagged = usable.map(
        lambda text: (TextBlob(text).detect_language(), text)
    ).cache()

    rdd_en = (
        tagged.filter(lambda pair: pair[0] == 'en')  # filter to english
        .map(lambda pair: abb_en(pair[1]))
    )
    rdd_ms = (
        tagged.filter(lambda pair: pair[0] == 'ms')  # filter to bahasa
        .map(lambda pair: abb_bm(pair[1]))
        .map(lambda text: str(TextBlob(text).translate(to='en')))  # translate to en
    )

    # Score every sentence once; the cached scores feed both counts so the
    # translation step is not re-run for the second count.
    polarity = (
        rdd_en.union(rdd_ms)
        .map(lambda text: TextBlob(text).sentiment.polarity)
        .cache()
    )
    positive_count = polarity.filter(lambda score: score > 0).count()
    negative_count = polarity.filter(lambda score: score < 0).count()

    make_plot(positive_count, negative_count)
if __name__ == "__main__":
    # Configure your Spark environment
    conf = SparkConf().setMaster("local[*]").setAppName("My Spark Application")
    sc = SparkContext(conf=conf)

    filename = "simple_sentences.txt"
    # The helper functions (resolve_emoticon, abb_bm, abb_en, make_plot,
    # remove_features) are invoked from inside main(); calling them here with
    # names that are not defined at this level raised NameError before the
    # job could start.
    try:
        main(sc, filename)
    finally:
        # Always release the Spark context, even if the job fails.
        sc.stop()
[text] hands-on 4
Viewer
*** This page was generated with the meta tag "noindex, nofollow". This happened because you selected this option before saving or the system detected it as spam. This means that this page will never get into the search engines and the search bot will not crawl it. There is nothing to worry about, you can still share it with anyone.
Editor
You can edit this paste and save as new: