Python Save Facebook Graph data to MySQL database

In this script I wanted to save public Facebook Page post data into my local database. See the source on GitHub.

The main.py script will set up the database structure, retrieve the data from the Facebook Graph API, then store it in your database.

If you would like only today's posts from the Facebook pages, use the -t argument; without it, the script will get all available public posts. You can also use -n (number of posts) to limit how many posts are saved.

You will also need to include the page names. Here is an example that saves all the public post data for the page TacoBell:

main.py -p tacobell

Here you get only today's posts:

main.py -p tacobell -t

To get multiple pages at once, no comma is needed; just add all the pages with a space in between:

main.py -p tacobell mcdonalds -t

main.py:


from time import sleep
import calendar
from datetime import datetime, timedelta

from dateutil import parser
from facebook import GraphAPI
import argparse
import sys
from urllib.parse import parse_qs, urlparse  # Python 2: from urlparse import parse_qs, urlparse
import mysql.connector
from config import config, access_token

# Connect without selecting a schema so the database can be created on first
# run; create_db_table() switches to it with USE once it exists.
conn_params = {k: v for k, v in config.items() if k != 'database'}
cnx = mysql.connector.connect(**conn_params)
cursor = cnx.cursor()

table = ("CREATE TABLE `pages` ("
         "  `id` int(11) NOT NULL AUTO_INCREMENT,"
         "  `page_name` varchar(40) NOT NULL,"
         "  `post_id` varchar(40) NOT NULL,"
         "  `created_time` datetime NOT NULL,"
         "  `message` TEXT NOT NULL,"
         "  PRIMARY KEY (`id`)"
         ") ENGINE=InnoDB")


def create_db_table():
    """Create the database and the pages table if they do not exist yet."""
    cursor.execute('SHOW DATABASES;')
    all_dbs = cursor.fetchall()
    if all_dbs.count((config['database'],)) == 0:
        print("Creating db")
        cursor.execute(
            "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(config['database']))

    cursor.execute('USE %s;' % config['database'])
    cursor.execute("SHOW TABLES;")
    all_tables = cursor.fetchall()
    if all_tables.count(('pages',)) == 0:
        print("creating table")
        cursor.execute(table)


add_message = ("INSERT INTO pages "
               "(post_id, page_name, created_time, message) "
               "VALUES (%s, %s, %s, %s)")


def insert_post(post_id, page_name, created_time, message):
    """Insert a post unless its post_id is already in the table."""
    cursor.execute("SELECT * FROM pages WHERE post_id=%s;", (post_id,))
    if cursor.fetchall():
        print("already saved this post")
        return False
    print('saving this post')
    cursor.execute(add_message, (post_id, page_name, created_time, message))
    cnx.commit()
    return True


def make_it_good(s):
    # Drop characters that MySQL's 3-byte utf8 charset cannot store
    # (e.g. emoji, which need 4 bytes in UTF-8)
    return ''.join(ch for ch in s if len(ch.encode('utf-8')) <= 3)


def utc_to_local(utc_dt):
    """Convert a UTC datetime (as returned by the Graph API) to local time."""
    # use an integer timestamp to avoid precision loss
    timestamp = calendar.timegm(utc_dt.timetuple())
    local_dt = datetime.fromtimestamp(timestamp)
    assert utc_dt.resolution >= timedelta(microseconds=1)
    return local_dt.replace(microsecond=utc_dt.microsecond)


def save_posts(page, num_post=None, only_today=False):
    if not num_post:
        num_post = 1000000000
    graph = GraphAPI(access_token)
    arguments = {}
    total = 0
    num_saved = 0
    while True:
        try:
            posts = graph.get_connections(page, 'posts', **arguments)
        except Exception:
            # back off for a few seconds and retry once before giving up
            try:
                sleep(3)
                posts = graph.get_connections(page, 'posts', **arguments)
            except Exception as e:
                print("Retrieved a total of %d posts" % total)
                print("But some error happened after that")
                print("This is the error")
                print(e)
                print("Breaking...")
                break
        next_url = None
        if 'paging' in posts:
            next_url = posts['paging'].get('next')
        posts = posts['data']
        total += len(posts)
        found_today = False
        for post in posts:
            if 'message' not in post:
                continue
            created_time = utc_to_local(parser.parse(post['created_time']))
            today = datetime.now()
            if only_today:
                if (today.year, today.month, today.day) == \
                        (created_time.year, created_time.month, created_time.day):
                    found_today = True
                else:
                    continue
            num_saved += 1
            insert_post(post['id'], page, created_time, make_it_good(post['message']))
            if num_saved >= num_post:
                break
        if num_saved >= num_post:
            break
        if only_today and not found_today:
            break
        if next_url:
            # carry the Graph API paging cursor into the next request
            qs = parse_qs(urlparse(next_url).query)
            arguments = {'limit': qs['limit'][0], 'until': qs['until'][0]}
        else:
            break
    print('done')
    print('Saved total of %d posts' % num_saved)


create_db_table()


def main():
    arg_parser = argparse.ArgumentParser(
        description='Save Facebook page posts to a MySQL db')
    arg_parser.add_argument('-p', nargs='+', help="Name(s) of the page(s) to save")
    arg_parser.add_argument('-n',
                            help="Number of posts to save to the db (if omitted, all "
                                 "posts are saved; note this might take some time)")
    arg_parser.add_argument('-t', action='store_true',
                            help="Only save today's posts")
    args = arg_parser.parse_args()
    only_today = args.t
    pages = args.p
    n = args.n
    if not pages:
        arg_parser.print_usage()
        sys.exit(1)
    if n:
        n = int(n)
    if only_today:
        print("Saving today's posts")
    for page in pages:
        print("Saving posts for %s" % page)
        save_posts(page, n, only_today)


if __name__ == '__main__':
    main()
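One detail worth calling out: the script follows Graph API pagination by parsing the paging.next URL out of each response and reusing its limit and until query parameters for the next request. A small illustration of that step (the URL below is a made-up example of what the API returns, not real output):

from urllib.parse import parse_qs, urlparse

# A made-up example of a `paging.next` URL from a Graph API response
next_url = ('https://graph.facebook.com/v2.5/tacobell/posts'
            '?limit=25&until=1438012800')

qs = parse_qs(urlparse(next_url).query)
arguments = {'limit': qs['limit'][0], 'until': qs['until'][0]}
print(arguments)  # {'limit': '25', 'until': '1438012800'}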

 

Config file:


config = {
    'user': 'root',
    'host': '127.0.0.1',
    'password': 'mypass',
    'database': 'analytics'
}
access_token = 'token'
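If you want to sanity-check the config before running main.py, a quick connection test looks like this (a minimal sketch; it skips the 'database' key since the script creates that database on first run):

import mysql.connector
from config import config

# Connect with the same credentials main.py uses, minus the database,
# which may not exist yet on a fresh install
params = {k: v for k, v in config.items() if k != 'database'}
cnx = mysql.connector.connect(**params)
print(cnx.get_server_info())  # prints the MySQL server version
cnx.close()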

Python Tweepy Twitter – MySQL query

I wanted to do some research on public tweets using tweepy. Basically, I wanted to save every tweet that met certain criteria to my local MySQL database instance. The criteria / filter is a table I keep in that same database.

I decided to use tweepy; you can find it on GitHub here.

Check out Brandon Wood's article here, which this is based on.


# twitter client
import tweepy

# database interface
import mysql.connector

conn = mysql.connector.connect(user='root', password='mypass',
                               host='127.0.0.1', database='analytics')
curs = conn.cursor()


class StreamWatcherHandler(tweepy.StreamListener):  # tweepy 3.x streaming API

    def on_status(self, status):
        try:
            usr = status.author.screen_name.strip()
            txt = status.text.strip()
            in_reply_to = status.in_reply_to_status_id
            src = status.source.strip()
            cat = status.created_at

            # Now that we have our tweet information, stow it away in the sql database
            curs.execute("INSERT INTO tweets (username, created_at, content, reply_to, source) "
                         "VALUES (%s, %s, %s, %s, %s)",
                         (usr, cat, txt, in_reply_to, src))
            conn.commit()
        except Exception as e:
            # Most errors we're going to see relate to the handling of UTF-8 messages (sorry)
            print(e)

    def on_error(self, status_code):
        print('An error has occurred! Status code = %s' % status_code)
        return True  # keep the stream alive


def main():
    # establish stream
    consumer_key = "key"
    consumer_secret = "secret"
    auth1 = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)

    access_token = "token"
    access_token_secret = "token_secret"
    auth1.set_access_token(access_token, access_token_secret)

    # This is where we define our filter from our database. Each row holds a
    # keyword that I want the stream to look for.
    curs.execute('SELECT name FROM mytable')
    keywords = [item[0] for item in curs.fetchall()]

    print("Establishing stream...")
    stream = tweepy.Stream(auth1, StreamWatcherHandler(), timeout=None)
    stream.filter(track=keywords)
    print("Done")


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("Disconnecting from database...")
        conn.commit()
        conn.close()
        print("Done")
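Note that the script assumes a tweets table already exists. Here is a minimal sketch of that schema; the column names come from the INSERT statement above, but the types and lengths are my assumption:

import mysql.connector

conn = mysql.connector.connect(user='root', password='mypass',
                               host='127.0.0.1', database='analytics')
curs = conn.cursor()

# Assumed schema, inferred from the INSERT above; adjust types as needed
curs.execute(
    "CREATE TABLE IF NOT EXISTS tweets ("
    "  id INT NOT NULL AUTO_INCREMENT,"
    "  username VARCHAR(60) NOT NULL,"
    "  created_at DATETIME NOT NULL,"
    "  content TEXT NOT NULL,"
    "  reply_to BIGINT NULL,"
    "  source VARCHAR(255) NULL,"
    "  PRIMARY KEY (id)"
    ") ENGINE=InnoDB")
conn.commit()
conn.close()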

Here is what the database table looks like:

mysql> SELECT name FROM mytable;
+-------+
| name  |
+-------+
| test  |
| test2 |
| test3 |
+-------+

So now the stream only looks for tweets that contain the keywords "test", "test2", or "test3".
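If you want to reproduce this setup, the filter table can be created and seeded like so (a minimal sketch; the table and column names come from the SELECT query in the script):

import mysql.connector

conn = mysql.connector.connect(user='root', password='mypass',
                               host='127.0.0.1', database='analytics')
curs = conn.cursor()

# One keyword per row; the stream tracks whatever this table contains
curs.execute("CREATE TABLE IF NOT EXISTS mytable (name VARCHAR(60) NOT NULL)")
curs.executemany("INSERT INTO mytable (name) VALUES (%s)",
                 [('test',), ('test2',), ('test3',)])
conn.commit()
conn.close()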