comment

import csv
import datetime
import json
import ssl
import sys
import time

from utils import request_until_succeed, open_csv_w
# NOTE(review): Python 3.6+ ships a standard-library module that is also
# called `secrets`; this import only finds the project-local secrets.py
# while the script's directory comes first on sys.path -- confirm.
from secrets import FACEBOOK_APP_ID, FACEBOOK_APP_SECRET

# to get comments from a page you will need to run the other scripts first

# NOTE(review): this context disables TLS certificate verification and is
# not referenced anywhere in the visible part of this script. It is kept
# because other code may import it -- confirm it is still needed.
context = ssl._create_unverified_context()

# this file_id must be the same as the ids you used to compile the posts
# using the other facebook scripts
file_id = "brooklynmuseum"

# get authentication
access_token = FACEBOOK_APP_ID + "|" + FACEBOOK_APP_SECRET

# Python 2's csv module needs encoded byte strings; Python 3's needs text.
_PY2 = sys.version_info[0] == 2

# Typographic characters mapped to their plain ASCII equivalents.
_ASCII_PUNCTUATION = {
    0x2018: 0x27,  # left single quote  -> apostrophe
    0x2019: 0x27,  # right single quote -> apostrophe
    0x201C: 0x22,  # left double quote  -> straight double quote
    0x201D: 0x22,  # right double quote -> straight double quote
    0xa0: 0x20,    # non-breaking space -> space
}


# Needed to write tricky unicode correctly to csv
def unicode_normalize(text):
    """Replace curly quotes and non-breaking spaces in *text* with ASCII.

    Args:
        text: A unicode string (as produced by ``json.loads``).

    Returns:
        The normalized value ready to be handed to ``csv.writer``: UTF-8
        encoded bytes on Python 2, and plain text on Python 3 (where the
        csv module would otherwise write the ``b'...'`` repr of bytes).
    """
    normalized = text.translate(_ASCII_PUNCTUATION)
    return normalized.encode('utf-8') if _PY2 else normalized


def getFacebookCommentFeedData(status_id, access_token, num_comments):
    """Fetch the first page of comments attached to a Graph API object.

    Args:
        status_id: Id of the post (or parent comment) whose comments are
            requested.
        access_token: Graph API access token appended to the query string.
        num_comments: Value sent as the ``limit`` query parameter.

    Returns:
        The decoded JSON response, or None when ``request_until_succeed``
        returns None.
    """
    # Construct the URL string
    base = "https://graph.facebook.com/v2.9"
    node = "/%s/comments" % status_id
    fields = ("?fields=id,message,like_count,created_time,comments,from,"
              "attachment")
    parameters = "&order=chronological&limit=%s&access_token=%s" % \
        (num_comments, access_token)
    url = base + node + fields + parameters

    # retrieve data
    data = request_until_succeed(url)
    if data is None:
        return None
    return json.loads(data)


def processFacebookComment(comment, status_id, parent_id=''):
    """Flatten one comment dictionary into a CSV row.

    Args:
        comment: Decoded comment object; ``id`` and ``created_time`` are
            required, ``message``, ``from``, ``like_count`` and
            ``attachment`` are optional.
        status_id: Id of the post the comment belongs to.
        parent_id: Id of the parent comment for replies, '' otherwise.

    Returns:
        A tuple ``(comment_id, status_id, parent_id, comment_message,
        comment_author, comment_published, comment_likes)``.

    Raises:
        KeyError: If ``id`` or ``created_time`` is missing, or an
            attachment has no ``type``.
        ValueError: If ``created_time`` is not in
            ``%Y-%m-%dT%H:%M:%S+0000`` format.
    """
    # Some items may not always exist, so check for existence first.
    comment_id = comment['id']
    comment_likes = comment.get('like_count', 0)

    # Build the message as text and normalize once at the end, instead of
    # encoding, decoding and re-encoding around the attachment tag.
    message = comment.get('message', '')
    if 'attachment' in comment:
        attach_tag = "[[%s]]" % comment['attachment']['type'].upper()
        message = attach_tag if not message else message + " " + attach_tag
    comment_message = unicode_normalize(message) if message else ''

    # 'from' can be absent; fall back to an empty author rather than
    # aborting the whole scrape with a KeyError.
    author_name = comment.get('from', {}).get('name', '')
    comment_author = unicode_normalize(author_name) if author_name else ''

    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    published = datetime.datetime.strptime(
        comment['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    # NOTE(review): fixed UTC-5 offset (EST); daylight saving time is not
    # taken into account.
    published = published + datetime.timedelta(hours=-5)
    # best time format for spreadsheet programs
    comment_published = published.strftime('%Y-%m-%d %H:%M:%S')

    # Return a tuple of all processed data
    return (comment_id, status_id, parent_id, comment_message,
            comment_author, comment_published, comment_likes)


def _load_next_page(page):
    """Return the page that follows *page*, or None when there is none.

    None is also returned when the follow-up request yields no data, so a
    failed fetch ends pagination instead of crashing in ``json.loads``.
    """
    next_url = page.get('paging', {}).get('next')
    if not next_url:
        return None
    data = request_until_succeed(next_url)
    return None if data is None else json.loads(data)


def _iter_pages(first_page):
    """Yield *first_page* and every following page; nothing if it is None."""
    page = first_page
    while page is not None:
        yield page
        page = _load_next_page(page)


def _count_and_report(num_processed):
    """Increment the running total, print progress every 1000 rows."""
    num_processed += 1
    # output progress occasionally to make sure code is not stalling
    if num_processed % 1000 == 0:
        print("%s Comments Processed: %s" %
              (num_processed, datetime.datetime.now()))
    return num_processed


def _write_comment_thread(writer, comment, status_id, access_token,
                          num_processed):
    """Write *comment* and all of its replies; return the updated total."""
    writer.writerow(processFacebookComment(comment, status_id))

    if 'comments' in comment:
        first_reply_page = getFacebookCommentFeedData(
            comment['id'], access_token, 100)
        for reply_page in _iter_pages(first_reply_page):
            for reply in reply_page['data']:
                writer.writerow(processFacebookComment(
                    reply, status_id, comment['id']))
                num_processed = _count_and_report(num_processed)

    return _count_and_report(num_processed)


def scrapeFacebookPageFeedComments(page_id, access_token):
    """Write every comment and reply of a page's posts to a CSV file.

    Status ids are read from ``../output/<page_id>_facebook_statuses.csv``
    and rows are written to ``../output/<page_id>_facebook_comments.csv``.

    Args:
        page_id: Identifier used in both file names; it must match the id
            used when the statuses file was produced.
        access_token: Graph API access token.
    """
    comments_path = '../output/%s_facebook_comments.csv' % page_id
    statuses_path = '../output/%s_facebook_statuses.csv' % page_id

    with open_csv_w(comments_path) as out_file:
        w = csv.writer(out_file)
        w.writerow(["comment_id", "status_id", "parent_id",
                    "comment_message", "comment_author",
                    "comment_published", "comment_likes"])

        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Comments From Posts: %s\n" %
              (page_id, scrape_starttime))

        # NOTE(review): the statuses file is only read here, yet it is
        # opened through a helper whose name suggests write mode and with
        # an extra 'rb' argument. utils is not visible from this file --
        # confirm the helper accepts a mode and does not truncate the file.
        with open_csv_w(statuses_path, 'rb') as csvfile:
            reader = csv.DictReader(csvfile)

            for status in reader:
                status_id = status['status_id']
                first_page = getFacebookCommentFeedData(
                    status_id, access_token, 100)

                for page in _iter_pages(first_page):
                    for comment in page['data']:
                        num_processed = _write_comment_thread(
                            w, comment, status_id, access_token,
                            num_processed)

        print("\nDone!\n%s Comments Processed in %s" %
              (num_processed, datetime.datetime.now() - scrape_starttime))


if __name__ == '__main__':
    scrapeFacebookPageFeedComments(file_id, access_token)


# The CSV can be opened in all major statistical programs. Have fun! :)

Comments

Popular posts from this blog

HOW TO EARN THROUGH CONTENT CREATION?