I have written a script which calculates lexical diversity and a few other meaningful statistics. My problem is that on some files it fails when it hits what I can only assume is a bad json line. Each of my data files contain strings of json each on their own lines. The meaningful field for these calculations is the “text” field.
My code:
import fileinput
import json
import sys
import os
from collections import defaultdict
line = [] # set to list
tw = 0 # set total words to int
tuw = 0 # set total unique words to int
lexd = 0 # set total lexical diversity to int
awpt = 0 # set average words per tweet to int
line_counter = 0
inputfilename = sys.argv[1] # read the first system argument as the input file name
word_count = defaultdict(int) # set word_count to the default dictionary
for line in fileinput.input([inputfilename]): # FOR each line in the input file
line = line.strip(); # strip any blank lines and throw them out
if not line: continue # if the file does contain a blank line still: in the case of EOF then continue
tweettext = json.loads(line).get('text') # load the line with json.loads and get the "text" field
if not json.loads(line).get('text'): continue # if the line does not contain json data then continue
words = tweettext.split() # split the words from the single line into individual dicts
tw += len(words) # total words counter
line_counter += 1 # total lines counter
print line_counter # so we know what line we're on
for word in words: # FOR each word in the individual line "text" corpus
word_count[word]+=1 # Take the word_count dict, insert the words and incriment
tuw = len(set(word_count)) # calculate the total number of unique words
lexd += 1.0*tuw/tw # calculate the lexical diversity
awpt = 1.0*tuw/line_counter # calc average number of words per tweet
print word_count # print the word list dictionary
print "total number of words", tw # print the total number of words
print "total uniq words", tuw # print the total number of unique words
print "total corpus lexical diversity", lexd # print the total lexical diversity of the entire corpus
print "average number of words per tweet", awpt # print the average number of words per tweet
Sample Data:
{"favorited": false, "in_reply_to_user_id": 213741147, "contributors": null, "truncated": false, "text": "@Rafinha_Angelo sim sim, manda o print l\u00e1 HUSAHUS!", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": "169216950453542912", "coordinates": null, "in_reply_to_user_id_str": "213741147", "entities": {"user_mentions": [{"indices": [0, 15], "screen_name": "Rafinha_Angelo", "id": 213741147, "name": "Rafael A. Figueiredo", "id_str": "213741147"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": 169216950453542912, "id_str": "169217034821976067", "in_reply_to_screen_name": "Rafinha_Angelo", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/images/themes/theme9/bg.gif", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1769152407/223_normal.JPG", "profile_sidebar_fill_color": "252429", "is_translator": false, "id": 67115876, "profile_text_color": "666666", "followers_count": 310, "profile_sidebar_border_color": "181A1E", "location": "Somewhere.", "default_profile_image": false, "listed_count": 0, "utc_offset": -10800, "statuses_count": 6027, "description": "it's like one more day, with no more things !", "friends_count": 106, "profile_link_color": "2FC2EF", "profile_image_url": "http://a2.twimg.com/profile_images/1769152407/223_normal.JPG", "notifications": null, "show_all_inline_media": false, "geo_enabled": true, "profile_background_color": "1A1B1F", "id_str": "67115876", "profile_background_image_url": "http://a1.twimg.com/images/themes/theme9/bg.gif", "screen_name": "Guiii_Fernandes", "lang": "en", "profile_background_tile": false, "favourites_count": 112, "name": "Guilherme Fernandes", "url": "http://facebook.com/GuiiFernandes", "created_at": "Wed Aug 19 20:43:05 +0000 2009", "contributors_enabled": false, "time_zone": "Brasilia", "protected": false, "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 169217034821976067, "source": "web"}
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "retweeted_status": {"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Borat voice). Xoxo, JM", "created_at": "Mon Feb 13 23:27:08 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169200965151494144", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 69751644, "description": "", "verified": true, "profile_image_url_https": "https://si0.twimg.com/profile_images/387138234/1_normal.jpg", "profile_sidebar_fill_color": "5c5c5c", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 473162, "profile_sidebar_border_color": "00e35f", "id_str": "69751644", "default_profile_image": false, "location": "Los Angeles", "utc_offset": -28800, "statuses_count": 5380, "profile_background_color": "00e35f", "friends_count": 10730, "profile_link_color": "05bcff", "profile_image_url": "http://a0.twimg.com/profile_images/387138234/1_normal.jpg", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/72720138/green.jpg", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/72720138/green.jpg", "screen_name": "jamesmaslow", "lang": "en", "profile_background_tile": false, "favourites_count": 1, "name": "james maslow", "url": "http://www.JamesMaslow.com", "created_at": "Sat Aug 29 01:32:02 +0000 2009", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "following": null, "listed_count": 8348}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169200965151494144, "source": "<a href=\"http://www.osfoora.com\" rel=\"nofollow\">Osfoora for iPhone</a>"}, "truncated": true, "text": "RT @jamesmaslow: On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Bora ...", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [3, 15], "id_str": "69751644", "id": 69751644, "name": "james maslow", "screen_name": "jamesmaslow"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169217034817765377", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 466873377, "description": "Totally dedicate for @1LoganHenderson MINE perfect BTBoy!!!! *--* Rusher for the infinity and beyond and much more beyond!!! Since 01/17/12 =*", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "profile_sidebar_fill_color": "940a2d", "is_translator": false, "geo_enabled": false, "profile_text_color": "eb4466", "followers_count": 103, "profile_sidebar_border_color": "d61153", "id_str": "466873377", "default_profile_image": false, "location": "", "utc_offset": -7200, "statuses_count": 3730, "profile_background_color": "070808", "friends_count": 154, "profile_link_color": "de243d", "profile_image_url": "http://a2.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "profile_background_image_url": "http://a3.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "screen_name": "Logiehbear", "lang": "en", "profile_background_tile": true, "favourites_count": 209, "name": "BBFFF da Laryh!!", "url": null, "created_at": "Tue Jan 17 21:53:17 +0000 2012", "contributors_enabled": false, "time_zone": "Mid-Atlantic", "protected": false, "default_profile": false, "following": null, "listed_count": 1}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169217034817765377, "source": "web"}
Script output:
1
2
defaultdict(<type 'int'>, {u'be': 1, u'is': 1, u'Going': 1, u'in': 2, u'I': 1, u'(said': 1, u'RT': 1, u'huge': 1, u'for': 1, u'l\xe1': 1, u'few': 1, u'Vegas': 1, u'manda': 1, u'print': 1, u'sim,': 1, u'sim': 1, u'On': 1, u'to': 1, u'like!': 1, u'HUSAHUS!': 1, u'rehearsal...this': 1, u'@jamesmaslow:': 1, u'...': 1, u'epic!': 1, u'stage': 1, u'a': 1, u'show.': 1, u'last': 1, u'of': 1, u'days': 1, u'o': 1, u'@Rafinha_Angelo': 1, u'the': 2, u'Bora': 1})
total number of words 36
total uniq words 34
total corpus lexical diversity 0.944444444444
average number of words per tweet 17.0
This runs actually quite fast, however on some of my datasets after a few thousands lines it will fail with:
Traceback (most recent call last):
File "lex.py", line 21, in <module>
tweettext = json.loads(line).get('text') # load the line with json.loads and get the "text" field
File "/usr/lib64/python2.7/json/__init__.py", line 326, in loads
return _default_decoder.decode(s)
File "/usr/lib64/python2.7/json/decoder.py", line 366, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib64/python2.7/json/decoder.py", line 382, in raw_decode
obj, end = self.scan_once(s, idx)
ValueError: Unterminated string starting at: line 1 column 1531 (char 1531)
If the script is getting hung up on the format of the line that it is reading in then I would like to just skip that line and move on. Any suggestions?
I’ve solved this issue using Jesse Harris’ solution of including an exception if json.loads errors out.
When I ran this against my data it resulted in the print out of a gzip’d line of data. As I mentioned in my previous comment this was due to switching to the gzip streaming API from twitter. Two thumbs up to @jesseharris