matplotlib example Nielsen2012Numpy quotes (snippet)

Article Outline

Python matplotlib example 'Nielsen2012Numpy quotes'

Functions in program:

  • def sentiment(text, norm='sqrt'):

Modules used in program (Python 2; see the note after this list):

  • import copy
  • import re
  • import numpy as np
  • import nltk.corpus
  • import matplotlib.finance
  • import matplotlib.dates
  • import datetime
  • import dateutil.parser
  • import simplejson as json
  • import urllib, urllib2
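
The program below is Python 2 code. As a rough, untested sketch (not part
of the original article), the closest Python 3 equivalents of the Python
2-only modules would be:

# Hypothetical Python 3 substitutions (sketch only):
# urllib, urllib2    -> urllib.request and urllib.parse
# simplejson         -> json (in the standard library)
# matplotlib.finance -> the separate mplfinance package
#                       (matplotlib.finance was later removed from matplotlib)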


Python matplotlib example: Nielsen2012Numpy quotes

import urllib, urllib2
import simplejson as json
import dateutil.parser
import datetime
import matplotlib.dates
import matplotlib.finance
from matplotlib import pyplot as plt
import nltk.corpus 
import numpy as np
import re
import copy


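# Companies to analyze: Yahoo! Finance ticker symbol and the title of the
# corresponding English Wikipedia article.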
companies = {
    'Novo Nordisk': {'stock': 'NVO', 'wikipedia': 'Novo_Nordisk'},
    'Pfizer': {'stock': 'PFE', 'wikipedia': 'Pfizer'} 
    }


filebase = '/home/fn/'

# Sentiment word list
# AFINN-111 is as of June 2011 the most recent version of AFINN
filename_afinn = filebase + 'data/AFINN/AFINN-111.txt'
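# Each line of AFINN-111 holds a word and an integer valence score from -5
# to +5, separated by a tab; build a {word: score} dict.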
afinn = dict(map(lambda (w, s): (unicode(w, 'utf-8'), int(s)), [ 
            ws.strip().split('\t') for ws in open(filename_afinn) ]))

stopwords = set(nltk.corpus.stopwords.words('english'))


# Word splitter pattern
pattern_split = re.compile(r"[^\w-]+", re.UNICODE)

def sentiment(text, norm='sqrt'):
    """
    Sentiment analysis.
    (sentiment, arousal, ambivalence, positive, negative) = sentiment(test)
    """
    words_with_stopwords = pattern_split.split(text.lower())
    # Exclude stopwords:
    words = filter(lambda w: w not in stopwords, words_with_stopwords)
    sentiments = map(lambda word: afinn.get(word, 0), words)
    keys = ['sentiment', 'arousal', 'ambivalence', 'positive', 'negative']
    if sentiments:
        sentiments = np.asarray(sentiments).astype(float)
        sentiment = np.sum(sentiments)
        arousal = np.sum(np.abs(sentiments))
        ambivalence = arousal - np.abs(sentiment)
        positive = np.sum(np.where(sentiments>0, sentiments, 0))
        negative = - np.sum(np.where(sentiments<0, sentiments, 0))
        result = np.asarray([sentiment, arousal, ambivalence, positive, negative])
        if norm == 'mean':
            result /= len(sentiments)
        elif norm == 'sum':
            pass
        elif norm == 'sqrt':
            result /= np.sqrt(len(sentiments))
        else:
            raise("Wrong ''norm'' argument")
    else:
        result = (0, 0, 0, 0, 0)
    return dict(zip(keys, result))
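
# A quick, hypothetical usage sketch (not part of the original program;
# assumes 'excellent' is listed at +3 in AFINN-111 and 'utterly' is
# unlisted, so it scores 0):
#
#   sentiment(u'utterly excellent')
#   # -> {'sentiment': 2.12, 'arousal': 2.12, 'ambivalence': 0.0,
#   #     'positive': 2.12, 'negative': 0.0}   (normalized by sqrt(2))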


today = datetime.date.today()

# Matplotlib x-axis date formatting
days_locations = matplotlib.dates.DayLocator()
months_locations = matplotlib.dates.MonthLocator()
months_formatter = matplotlib.dates.DateFormatter("%Y %b")

# Prepare URL and download for Wikipedia
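# (Wikimedia asks API clients to identify themselves via the User-Agent
# header, hence the contact details below.)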
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Finn Aarup Nielsen, +45 45 25 39 21')]
urlbase = "http://en.wikipedia.org/w/api.php?"


for company, fields in companies.items():
    wikipedia_revisions = []
    urlparam = {'action': 'query', 
                'format': 'json', 
                'prop': 'revisions',
                'rvlimit': 50,
                'rvprop': 'ids|timestamp|content', 
                'titles': fields['wikipedia']}
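    # The MediaWiki API returns at most 'rvlimit' (here 50) revisions per
    # request and signals remaining data with a 'query-continue' element,
    # so loop to collect up to 7 x 50 = 350 of the most recent revisions.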
    for i in range(7):
        url = urlbase + urllib.urlencode(urlparam)
        wikipedia_result = json.load(opener.open(url))
        wikipedia_revisions.extend(wikipedia_result['query']['pages'].values()[0]['revisions'])
        print("%s: %d" % (company, len(wikipedia_revisions)))
        if 'query-continue' in wikipedia_result: 
            urlparam.update(wikipedia_result['query-continue']['revisions'])
        else:
            break
    wikipedia_last_timestamp = wikipedia_revisions[-1]['timestamp']
    wikipedia_last_datetime = dateutil.parser.parse(wikipedia_last_timestamp)
    wikipedia_last_date = wikipedia_last_datetime.date()
    for n, revision in enumerate(wikipedia_revisions):
        wikipedia_revisions[n].update(sentiment(revision['*']))
    companies[company].update({'wikipedia_revisions': copy.deepcopy(wikipedia_revisions)})
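    # Download daily stock quotes from the date of the oldest fetched
    # revision until today.  (Note: this legacy Yahoo! Finance interface
    # and the matplotlib.finance module have since been discontinued.)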
    companies[company].update({'quotes': matplotlib.finance.quotes_historical_yahoo(fields['stock'], wikipedia_last_date, today)})
    xaxis_range = matplotlib.dates.date2num(wikipedia_last_date), matplotlib.dates.date2num(today)
    fig = plt.figure() 
    for i in range(1,3):
        ax = fig.add_subplot(2, 1, i)
        ax.xaxis.set_major_locator(months_locations)
        ax.xaxis.set_minor_locator(days_locations)
        ax.xaxis.set_major_formatter(months_formatter)
        if i == 1:
            quotes = companies[company]['quotes']
            h = matplotlib.finance.candlestick(ax, quotes)
            h = plt.ylabel('Stock price')
            h = plt.title(company)
        else:
            x = map(lambda fs: matplotlib.dates.date2num(dateutil.parser.parse(fs['timestamp'])), wikipedia_revisions) 
            y = map(lambda fs: fs['sentiment'], wikipedia_revisions)
            h = plt.plot(x, y)
            h = plt.xlabel('Date')
            h = plt.ylabel('Wikipedia sentiment')
        h = ax.set_xlim(xaxis_range)
        fig.autofmt_xdate()
    plt.show()