#!/usr/bin/env python2.4 import ConfigParser import math import os.path import pickle import re import sys import time import urllib2 import twitter import pygooglechart as gchart config = ConfigParser.ConfigParser() config.read('/home/fweez/tweetalyzer/tweetalyzer.cfg') USERNAME = config.get('auth', 'username') PASSWD = config.get('auth', 'passwd') API_ABUSE_LEVEL = config.get('other', 'abuse') USERDATA = config.get('data', 'users') ANALYSIS = config.get('data', 'analysis') TEMPLATE = config.get('data', 'template') URL = config.get('output', 'baseurl') SM_X = int(config.get('output', 'small_imgwidth')) LG_X = int(config.get('output', 'large_imgwidth')) SM_Y = int(config.get('output', 'imgheight')) DOMSGS = config.get('output', 'dm_when_done') class Tweetalyzer(object): chatre = re.compile("@([a-zA-Z0-9]+)") emoticons = [ ':)', ':(', 'XP', ':P', '(:', '):', '^_^' ] # need more def __init__(self, userid, reason, api): self.userid = userid self.reason = reason self.api = api self.analysis = None def update_user(self): """If we've never seen this user, download their account. If we have, update the db.""" try: datafile = open(USERDATA + '/%d' % self.userid, 'r') userdata = pickle.load(datafile) since_id = userdata['last_update'] except IOError: userdata = None since_id = None timeline = self.api.GetUserTimeline(user=self.userid, count=API_ABUSE_LEVEL, since_id=since_id) if timeline: self.analyze(timeline, analysis=userdata) else: self.analysis=userdata def analyze(self, timeline, analysis=None): """Crunches numbers""" if not analysis: analysis = { 'userdict' : None, 'last_update' : None, 'chatters' : {}, 'emoticons' : {}, 'clients' : {}, 'lendist' : {}, # text length 'hoddist' : [0]*24, # hour of day 'dowdist' : {}, # day of week 'domdist' : {}, # day of month 'tbpdist' : {}, # time between posts 'fulldist' : {}, # full-timeline view 'last_tweet_time' : None, 'shortest' : None, 'longest' : None, 'emoticoned' : 0, 'response_count' : 0, 'twoosh_count' : 0, 'total_len' : 0, 'count' : 0, 'tweetdata' : [] } if not analysis['userdict']: # we need to look up their extended user info analysis['userdict'] = self.api.GetUser(self.userid).AsDict() hrs_from_utc = analysis['userdict'].get('utc_offset', 0) / 3600 # process oldest-first timeline.sort(lambda a,b: cmp(a.GetCreatedAtInSeconds(), b.GetCreatedAtInSeconds())) for t in timeline: txt = t.GetText() l = len(txt) datestr = t.GetCreatedAt() analysis['tweetdata'].append(t.AsDict()) analysis['total_len'] += l analysis['count'] += 1 analysis['last_update'] = t.GetId() if l == 140: analysis['twoosh_count'] += 1 if txt.startswith("@"): analysis['response_count'] += 1 # top chat partners for u in set(self.chatre.findall(txt)): analysis['chatters'][u] = analysis['chatters'].get(u, 0) + 1 # Distribution of len(t) analysis['lendist'][l] = analysis['lendist'].get(l, 0) + 1 # distribution by hour in day hod = int(datestr[11:13]) + hrs_from_utc if hod >= 0 and hod <= 23: analysis['hoddist'][hod] += 1 # distribution by day of week dow = datestr[0:3] analysis['dowdist'][dow] = analysis['dowdist'].get(dow, 0) + 1 # ... of month dom = datestr[8:10] analysis['domdist'][dom] = analysis['domdist'].get(dom, 0) + 1 # distribution of time between posts if analysis['last_tweet_time']: diff = t.GetCreatedAtInSeconds() - analysis['last_tweet_time'] analysis['tbpdist'][diff] = analysis['tbpdist'].get(diff, 0) + 1 analysis['last_tweet_time'] = t.GetCreatedAtInSeconds() # full (day-resolution) timeline # first, reduce seconds resolution to day level timelist = list(time.localtime(t.GetCreatedAtInSeconds())) timelist = timelist[:3] + [0]*3 + timelist[6:] day = time.mktime(timelist) analysis['fulldist'][day] = analysis['fulldist'].get(day, 0) + 1 # shortest t if not analysis['shortest'] or \ l < len(analysis['shortest'].GetText()): analysis['shortest'] = t # longest t if not analysis['longest'] or \ l > len(analysis['longest'].GetText()): analysis['longest'] = t # posts with emoticons has_crap = False for crap in self.emoticons: if crap in txt: has_crap = True analysis['emoticons'][crap] = \ analysis['emoticons'].get(crap, 0) + 1 if has_crap: analysis['emoticoned'] += 1 # client distribution client = t.raw['source'] analysis['clients'][client] = analysis['clients'].get(client, 0) + 1 # url'd, and most-used domains # most-used words # least-used words # markov-chain-generated t print "Processed", analysis['userdict']['screen_name'] pickle.dump(analysis, open(USERDATA + '/%d' % self.userid, 'w')) self.analysis = analysis def responses_chart(self): chart = gchart.PieChart2D(SM_Y * 3, SM_Y * 2) chart.add_data( (self.analysis['response_count'], self.analysis['count'] - self.analysis['response_count'])) chart.set_pie_labels(['@']) return chart.get_url() def twoosh_chart(self): chart = gchart.PieChart2D(SM_Y * 3, SM_Y * 2) chart.add_data( (self.analysis['twoosh_count'], self.analysis['count'] - self.analysis['twoosh_count'])) chart.set_pie_labels(['140']) return chart.get_url() def emoticons_chart(self): chart = gchart.PieChart2D(SM_Y * 3, SM_Y * 2) chart.add_data( (self.analysis['emoticoned'], self.analysis['count'] - self.analysis['emoticoned'])) chart.set_pie_labels([':)']) return chart.get_url() def lendist_chart(self): x_data = self.analysis['lendist'].keys() x_data.sort() y_data = [ self.analysis['lendist'][x] for x in x_data ] chart = gchart.XYLineChart(SM_X, SM_Y, x_range=(0,140), y_range=(0,max(y_data))) chart.add_data(x_data) chart.add_data(y_data) chart.set_axis_labels(gchart.Axis.LEFT, ('', max(y_data))) chart.set_axis_labels(gchart.Axis.BOTTOM, (0,140)) return chart.get_url(gchart.TextData) def hoddist_chart(self): chart = gchart.SimpleLineChart( SM_X, SM_Y, y_range=(0, max(self.analysis['hoddist'])), x_range=(0,23)) labels = range(0,24) for i in range(len(labels)): if i % 6: labels[i] = '' chart.add_data(self.analysis['hoddist']) chart.set_axis_labels(gchart.Axis.BOTTOM, labels) chart.set_axis_labels(gchart.Axis.LEFT, ('', max(self.analysis['hoddist']))) return chart.get_url() def dowdist_chart(self): chart = gchart.SimpleLineChart( SM_X, SM_Y, y_range=(0, max(self.analysis['dowdist'].values()))) realdays = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] chart.add_data([ self.analysis['dowdist'].get(day, 0) for day in realdays ]) labeldays = [ 's', 'm', 't', 'w', 't', 'f', 's'] chart.set_axis_labels(gchart.Axis.BOTTOM, labeldays) chart.set_axis_labels(gchart.Axis.LEFT, ('', max(self.analysis['hoddist']))) return chart.get_url() def tbpdist_chart(self): seconds = 0 minutes = 0 hours = 0 days = 0 weeks = 0 months = 0 for time, count in self.analysis['tbpdist'].iteritems(): if time < 60: seconds += count elif time < 60 * 60: minutes += count elif time < 60 * 60 * 24: hours += count elif time < 60 * 60 * 24 * 7: days += count elif time < 60 * 60 * 24 * 7 * 4: weeks += count else: months += count data = [ seconds, minutes, hours, days, weeks, months ] chart = gchart.StackedVerticalBarChart(SM_X, SM_Y, y_range=(0, max(data))) chart.add_data(data) chart.set_bar_width(SM_X / 8) chart.set_axis_labels(gchart.Axis.BOTTOM, ("seconds", "minutes", "hours", "days", "weeks", "months")) chart.set_axis_labels(gchart.Axis.LEFT, ('', max(data))) return chart.get_url() def fulldist_chart(self): x_data = self.analysis['fulldist'].keys() x_data.sort() y_data = [ self.analysis['fulldist'][x] for x in x_data ] # get starting day and ending day as x-axis start and end labels start = list(time.localtime(x_data[0])[0:3]) + [0]*6 end = list(time.localtime(x_data[-1])[0:2]) + \ [time.localtime(x_data[-1])[2] + 1] + [0]*6 # either google or firefox doesn't like big urls if len(x_data) > 150: # so, compress the data compression_ratio = float(150) / (len(x_data) - 150) if (compression_ratio < 2): compression_ratio = 2 compression_ratio = int(math.ceil(compression_ratio)) print "Compressing", self.userid, \ self.analysis['userdict']['screen_name'], \ compression_ratio for i in range(0, len(x_data), compression_ratio): if i - 1 >= 0: target_x = x_data[i - 1] self.analysis['fulldist'][target_x] += y_data[i] / 2 if i + 1 < len(x_data): target_x = x_data[i + 1] self.analysis['fulldist'][target_x] += y_data[i] for i in range(0, len(x_data), compression_ratio): del(self.analysis['fulldist'][x_data[i]]) return self.fulldist_chart() # scale data down by hand maxx = max(x_data) - x_data[0] x_data = [ (x - x_data[0]) / float(maxx) for x in x_data ] chart = gchart.XYLineChart(LG_X, SM_Y, x_range=(0, int(x_data[-1])), y_range=(int(min(y_data)), int(max(y_data)))) chart.add_data(x_data) chart.add_data(y_data) chart.set_axis_labels(gchart.Axis.LEFT, (0, max(y_data))) chart.set_axis_labels(gchart.Axis.BOTTOM, ("%d/%d" % (start[1], start[0]), "%d/%d" % (end[1], end[0]))) return chart.get_url(gchart.TextData) def save_page(self): '''Save the analysis, all formatted purty-like''' templ = open(TEMPLATE) callbacks = { 'FULLDIST': self.fulldist_chart(), 'HODDIST' : self.hoddist_chart(), 'DOWDIST' : self.dowdist_chart(), 'TBPDIST' : self.tbpdist_chart(), 'LENDIST' : self.lendist_chart(), 'REPLIES' : self.responses_chart(), 'TWOOSH' : self.twoosh_chart(), 'NUMTWEETS': str(self.analysis['count']), 'USERNAME': self.analysis['userdict']['screen_name'], } for thing in ('chatters', 'clients'): things = self.analysis[thing].items() things.sort(lambda a,b: cmp(b[1], a[1])) things = "\n".join([ "
  • %s (%d)
  • " %( c[0], c[1]) for c in things[:5] ]) callbacks[thing.upper()] = things output = open(ANALYSIS + '/%s.html' % self.userid, 'w') for line in templ: for thing in callbacks.keys(): replstr = "%" + thing + "%" if replstr in line: line = line.replace(replstr, callbacks[thing]) output.write(line) if DOMSGS: self.api.PostDirectMessage( self.userid, "You can has analysis! %s%s.html (sent because: %s)" % (URL, self.userid, self.reason)) class TABot(object): """Downloads private messages to the account and runs analyses""" def __init__(self): self.api = twitter.Api(USERNAME, PASSWD) self.to_analyze = [] def check_privmsgs(self): """Check the bot's private messages for requests""" followers = self.api.GetFollowers() friends = self.api.GetFriends() for follower in followers: userdata = USERDATA + "/" + str(follower.GetId()) if follower not in friends: if os.path.exists(userdata): print "Not friended, but I have userdata for " \ "%s (%s). Unlinking userdata." % ( follower.GetId(), follower.GetScreenName()) os.unlink(userdata) continue follower = self.api.GetUser(follower.GetId()) if follower.suspended: print follower.GetScreenName(), "is suspended, " \ "not friending." continue try: print "Friending", follower.GetScreenName() self.api.CreateFriendship(follower.GetId()) self.to_analyze.append(("new user", follower.GetId())) continue except urllib2.HTTPError: print "Couldn't friend", follower.GetScreenName() continue if follower.suspended: print follower.GetScreenName(), "is suspended, unfriending." self.api.DestroyFriendship(follower.GetId()) continue if not os.path.exists(userdata): try: self.to_analyze.append(("no user data cache", follower.GetId())) except: print "Couldn't analyze", follower.GetScreenName() for dm in self.api.GetDirectMessages(): if dm.GetSenderId() in self.to_analyze: continue self.to_analyze.append(("sent me a dm", dm.GetSenderId())) self.api.DestroyDirectMessage(dm.GetId()) def run(self): self.check_privmsgs() for t in self.to_analyze: reason, userid = t analyzer = Tweetalyzer(userid, reason, self.api) updater = analyzer.update_user() analyzer.save_page() if __name__ == "__main__": bot = TABot() for i in range(1, len(sys.argv)): bot.to_analyze.append(("specified on command line", int(sys.argv[i]))) bot.run()