jeudi 3 septembre 2015

Twitter user data extraction in sqlite Database

I am trying to get twitter user information into a table name typhoon inside Sqlite database.

import sys
import urllib
import string
import simplejson
import sqlite3

import time
import datetime
from pprint import pprint

import sqlalchemy
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy import Column, Integer, String, VARCHAR, ForeignKey, Text, DateTime, Unicode, Float # importing Unicode is important! If not, you likely encounter data type error.
from sqlalchemy.ext.declarative import declarative_base
from types import *

from datetime import datetime, date, time

from twython import Twython
t = Twython(app_key='',       #REPLACE 'APP_KEY' WITH YOUR APP KEY, ETC., IN THE NEXT 4 LINES
    app_secret='',
    oauth_token='',
    oauth_token_secret='')

Base = declarative_base()

class TWEET(Base):
    __tablename__ = 'typhoon' # This is table name, you can change to yours
    rowid = Column(Integer, primary_key=True)  
    query = Column(String)
    user_type = Column (String)
    #tweet_id = Column(Integer) 
    tweet_id = Column(String) 
    #to_user_id = Column(String)
    inserted_date = Column(DateTime)
    truncated = Column(String)
    language = Column(String)
    possibly_sensitive = Column(String)  ### NEW 
    coordinates = Column(String)                    
    retweeted_status = Column(String)
    withheld_in_countries = Column(String)
    withheld_scope = Column(String)
    created_at_text = Column(String)                
    created_at = Column(DateTime)
    month = Column(String)
    year = Column(String)
    content = Column(Text)
    from_user_screen_name = Column(String)          
    from_user_id = Column(String)   
    from_user_followers_count = Column(Integer)     
    from_user_friends_count = Column(Integer)       
    from_user_listed_count = Column(Integer)        
    from_user_favourites_count = Column(Integer)    
    from_user_statuses_count = Column(Integer)  
    from_user_description = Column(String)          
    from_user_location = Column(String)             
    from_user_created_at = Column(String)           
    retweet_count = Column(Integer)
    favorite_count = Column(Integer)                
    entities_urls = Column(Unicode(255))
    entities_urls_count = Column(Integer)        
    entities_hashtags = Column(Unicode(255))
    entities_hashtags_count = Column(Integer)    
    entities_mentions = Column(Unicode(255))    
    entities_mentions_count = Column(Integer)  
    in_reply_to_screen_name = Column(String)  
    in_reply_to_status_id = Column(String)  
    source = Column(String)
    entities_expanded_urls = Column(Text) 
    json_output = Column(String)
    entities_media_count = Column(Integer)
    media_expanded_url = Column(Text) 
    media_url = Column(Text) 
    media_type = Column(Text) 
    video_link = Column(Integer)
    photo_link = Column(Integer)
    twitpic = Column(Integer)
    num_characters = Column(Integer)                    
    num_words = Column(Integer)                         
    #hashtags = Column(Unicode(255))
    #direct = Column(Unicode(255))
    #initial_direct = Column(Unicode(255))
    #initial_rt = Column(Unicode(255))
    #clicks = Column(Integer)    
    #total_clicks = Column(Integer)    

    def __init__(self, query, user_type, tweet_id, inserted_date, truncated, language, possibly_sensitive, coordinates, 
    retweeted_status, withheld_in_countries, withheld_scope, created_at_text, created_at, month, year, content, 
    from_user_screen_name, from_user_id, from_user_followers_count, from_user_friends_count,   
    from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,   
    from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,         
    entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,   
    in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls, json_output, 
    entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic, 
    num_characters, num_words,  
    ):
    # make sure the screen names in the quoto match all names after self. and in class.
        self.query = query
        self.user_type = user_type
        self.tweet_id = tweet_id
        self.inserted_date = inserted_date
        #self.to_user_id =  to_user_id
        self.truncated = truncated
        self.language = language
        self.possibly_sensitive = possibly_sensitive
        self.coordinates = coordinates
        self.retweeted_status = retweeted_status
        self.withheld_in_countries = withheld_in_countries
        self.withheld_scope = withheld_scope
        self.created_at_text = created_at_text
        self.created_at = created_at 
        self.month = month
        self.year = year
        self.content = content
        self.from_user_screen_name = from_user_screen_name
        self.from_user_id = from_user_id       
        self.from_user_followers_count = from_user_followers_count
        self.from_user_friends_count = from_user_friends_count
        #self.from_user_friends_name = from_user_friends_name
        self.from_user_listed_count = from_user_listed_count
        self.from_user_favourites_count = from_user_favourites_count
        self.from_user_statuses_count = from_user_statuses_count
        self.from_user_description = from_user_description
        self.from_user_location = from_user_location
        self.from_user_created_at = from_user_created_at
        self.retweet_count = retweet_count
        self.favorite_count = favorite_count
        self.entities_urls = entities_urls
        self.entities_urls_count = entities_urls_count        
        self.entities_hashtags = entities_hashtags
        self.entities_hashtags_count = entities_hashtags_count
        self.entities_mentions = entities_mentions
        self.entities_mentions_count = entities_mentions_count     
        self.in_reply_to_screen_name = in_reply_to_screen_name
        self.in_reply_to_status_id = in_reply_to_status_id
        self.source = source
        self.entities_expanded_urls = entities_expanded_urls
        self.json_output = json_output
        self.entities_media_count = entities_media_count
        self.media_expanded_url = media_expanded_url
        self.media_url = media_url
        self.media_type = media_type
        self.video_link = video_link
        self.photo_link = photo_link
        self.twitpic = twitpic
        self.num_characters = num_characters
        self.num_words = num_words


    def __repr__(self):
       return "<sender, created_at('%s', '%s')>" % (self.from_user_screen_name,self.created_at)

# This is for defining a table that has a list of screen names to be mined.         
class ACCOUNT(Base):
    __tablename__ = 'Accounts1' # this is the table name for a list of scree names to be mined. You need to go to SQLite Database browser and create a new DB (make sure that DB's name matches the one defined in this script); within that DB, create a table, make sure the table name and field names match the ones defined here. 
    rowid = Column(Integer, primary_key=True)     
    screen_name = Column(String)  
    user_type = Column(String) # for categorizing user's attribute (e.g. organization or individual)

    def __init__(self, screen_name, user_type
    ):       

        self.screen_name = screen_name
        self.user_type = user_type


    def __repr__(self):
       return "<Company, CSR_account('%s', '%s')>" % (self.rowid, self.screen_name)


def get_data(kid):
    try:        

        d = t.get_user_timeline(screen_name=kid, count="1", page="1", include_entities="true", include_rts="1")  #NEW LINE        

    except Exception as e:

        print ("Error reading id %s, exception: %s" % (kid, e))
        return None

        print (len(d),)
        return d

def write_data(self, d, screen_name, user_type):
    query = screen_name
    user_type = user_type   
    for entry in d:             
        print (user_type)
        json_output = str(entry)


        tweet_id = entry['id']
        inserted_date = datetime.now()  



        truncated = entry['truncated']
        language = entry['lang']



        if 'possibly_sensitive' in entry:
            possibly_sensitive= entry['possibly_sensitive']
        else:
            possibly_sensitive = ''

        coordinates = []
        if 'coordinates' in entry and entry['coordinates'] != None:
            print (entry['coordinates']['coordinates'])

            coordinates = ', '.join(map(str, entry['coordinates']['coordinates']))  
            print (coordinates)

        else:
            coordinates = ''

        if 'retweeted_status' in entry:
            retweeted_status = 'THIS IS A RETWEET'
        else:
            retweeted_status = ''  

        if 'withheld_in_countries' in entry:
            withheld_in_countries = 'WITHHELD --> CHECK JSON'
        else:
            withheld_in_countries = ''

        if 'withheld_scope' in entry:
            withheld_scope = entry['withheld_scope']
        else:
            withheld_scope = ''


        content = entry['text']
        content = content.replace('\n','')       

        created_at_text = entry['created_at']    


        created_at = datetime.strptime(created_at_text, '%a %b %d %H:%M:%S +0000 %Y')  

        created_at2 = created_at.strftime('%Y-%m-%d %H:%M:%S')  

        month = created_at.strftime('%m')
        year = created_at.strftime('%Y')
        print ('month', month, 'year', year)



        from_user_screen_name = entry['user']['screen_name']

        from_user_id = entry['user']['id'] 
        from_user_followers_count = entry['user']['followers_count']
        from_user_friends_count = entry['user']['friends_count']   
        from_user_listed_count = entry['user']['listed_count']
        from_user_favourites_count = entry['user']['favourites_count']
        print ('from_user_favourites_count-------------->', from_user_favourites_count)
        from_user_statuses_count = entry['user']['statuses_count'] 
        from_user_description = entry['user']['description'] 
        from_user_location = entry['user']['location'] 
        from_user_created_at = entry['user']['created_at']

        retweet_count = entry['retweet_count'] 
        favorite_count = entry['favorite_count']

        in_reply_to_screen_name = entry['in_reply_to_screen_name']
        in_reply_to_status_id = entry['in_reply_to_status_id']




        entities_urls_count = len(entry['entities']['urls'])    
        entities_hashtags_count = len(entry['entities']['hashtags'])   
        entities_mentions_count = len(entry['entities']['user_mentions']) 

        source = entry['source']




        entities_urls, entities_expanded_urls, entities_hashtags, entities_mentions = [], [], [], []


        for link in entry['entities']['urls']:
            if 'url' in link:
                url = link['url']
                expanded_url = link['expanded_url']
                #print (link['url'], link['expanded_url'])
                entities_urls.append(url)
                entities_expanded_urls.append(expanded_url)
            else:
                print ("No urls in entry")

        for hashtag in entry['entities']['hashtags']:
            if 'text' in hashtag:
                tag = hashtag['text']
                #print hashtag['text']
                entities_hashtags.append(tag)
            else:
                print ("No hashtags in entry")

        for at in entry['entities']['user_mentions']:
            if 'screen_name' in at:
                mention = at['screen_name']
                #print at['screen_name']
                entities_mentions.append(mention)
            else:
                print ("No mentions in entry")

        entities_mentions = string.join(entities_mentions, u", ")
        entities_hashtags = string.join(entities_hashtags, u", ")
        entities_urls = string.join(entities_urls, u", ")
        entities_expanded_urls = string.join(entities_expanded_urls, u", ")    

        video_link = 0
        if 'vimeo' in entities_expanded_urls or 'youtube' in entities_expanded_urls or 'youtu' in entities_expanded_urls or 'vine' in entities_expanded_urls:
            video_link = 1                  #All of these videos show up in 'View Media' in tweets
            print ("FOUND A VIDEO!!!")
        else:
            video_link = 0


        if 'twitpic' in entities_expanded_urls:
            twitpic = 1                     #twitpic images show up in 'View Media' in tweets
            print ("FOUND A TWITPIC LINK!")
        else:
            twitpic = 0
        if 'twitpic' in entities_expanded_urls or 'instagram' in entities_expanded_urls or 'instagr' in entities_expanded_urls:
            photo_link = 1                  #instagram images DO NOT show up in 'View Media' in tweets
            print ("FOUND A TWITPIC OR INSTAGRAM LINK!!!")
        else:
            photo_link = 0


        entities_urls = unicode(entities_urls)
        entities_expanded_urls = unicode(entities_expanded_urls)
        entities_hashtags = unicode(entities_hashtags)
        entities_mentions = unicode(entities_mentions)

        print ("urls...?....",) #entities_urls 
        print ("user_mentions...?....",) #entities_mentions
        print ("hashtags...?....",) #entities_hashtags


    if 'symbols' in entry['entities']:
        print ("HERE ARE THE SYMBOLS.......",) #entry['entities']['symbols']
    else:
        print ("THERE AIN'T NO entry['entities']['symbols']")

    if 'media' in entry['entities']:
        print ("HERE ARE THE MEDIA.......",) 
        entities_media_count = len(entry['entities']['media'])   

    else:
        entities_media_count = ''





    if 'media' in entry['entities']:
        if 'expanded_url' in entry['entities']['media'][0]:
            media_expanded_url = entry['entities']['media'][0]['expanded_url']
        else:
            print ("THERE AIN'T NO expanded_url in entry['entities']['media']")
            media_expanded_url = ''
        if 'media_url' in entry['entities']['media'][0]:
            media_url = entry['entities']['media'][0]['media_url']
        else:
            print ("THERE AIN'T NO media_url in entry['entities']['media']")
            media_url = ''
        if 'type' in entry['entities']['media'][0]:
            media_type = entry['entities']['media'][0]['type']
        else:
            print ("THERE AIN'T NO type in entry['entities']['media']")
            media_type = ''
    else:
        media_type = ''
        media_url = ''
        media_expanded_url = ''



        updates = self.session.query(TWEET).filter_by(query=query, from_user_screen_name=from_user_screen_name,content=content).all() 

        if not updates:
            print ("inserting, query:", query)                   

            upd = TWEET(query, user_type, tweet_id, inserted_date, truncated, language, possibly_sensitive, 
                coordinates, retweeted_status, withheld_in_countries, withheld_scope, created_at_text, 
                created_at, month, year, content, from_user_screen_name, from_user_id, from_user_followers_count, 
                from_user_friends_count, from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,   
                from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,         
                entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,   
                in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls, json_output, 
                entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic, None, None,
                )
            self.session.add(upd)


        else:
            if len(updates) > 1:
                print ("Warning: more than one update matching to_user=%s, text=%s")\
                       % (to_user, content)
            else:
                print ("Not inserting, dupe..")




##class Scrape:
##    def __init__(self):    
##        engine = sqlalchemy.create_engine("sqlite:///C:/Python34/Lib/site-packages/The Curiosity Bites Python Tutorial - getting Twitter user profiles.py", echo=False)  # enter your file path here.
##        #engine = sqlalchemy.create_engine("sqlite:///python_beginnerNEW.sqlite", echo=False)  # enter your file path here.
##        #engine = sqlalchemy.create_engine("sqlite:///python_beginner.sqlite", echo=False)  # enter your file path here.
##        #engine = sqlalchemy.create_engine("sqlite:///C:/python34/The Curiosity Bites Python Tutorial - getting Twitter user profiles.py", echo=False)  # enter your file path here.
##        Session = sessionmaker(bind=engine)
##       
##        self.session = Session()  
##        Base.metadata.create_all(engine)

class Scrape:
    def __init__(self):    
        #engine = sqlalchemy.create_engine("sqlite:///C:/Python34/Lib/site-packages/The Curiosity Bites Python Tutorial - getting Twitter user profiles.py", echo=False)  # enter your file path here.
        #engine = sqlalchemy.create_engine("sqlite:///python_beginnerNEW.sqlite", echo=False)  # enter your file path here.
        engine = sqlalchemy.create_engine("sqlite:///python_beginner.sqlite", echo=False)  # enter your file path here.
        #engine = sqlalchemy.create_engine("sqlite:///C:/python34/The Curiosity Bites Python Tutorial - getting Twitter user profiles.py", echo=False)  # enter your file path here.
        Session = sessionmaker(bind=engine)

        self.session = Session()  
        Base.metadata.create_all(engine)


    def main(self):

        all_ids = self.session.query(ACCOUNT).all()

        keys = []
        for i in all_ids[0:]: # starting from the first (1) row
        #for i in all_ids:
            screen_name = i.screen_name
            kid = screen_name               #THIS ROW IS DIFFERENT FROM THE MENTIONS AND DMS FILES
            rowid = i.rowid
            user_type = i.user_type

By here My code is running fine at Python shell it shows me processing id 1/14...to 14/14 But when I see the database table. Its nothing in there.I am unable to get that why its not writing the values in created typhoon table.

            print ("\rprocessing id %s/%s  --  %s" % (rowid, len(all_ids), screen_name),)
            #sys.stdout.flush()


            d = get_data(kid)
            if not d:
                continue         


            if len(d)==0:               
                print ("THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID")
                continue



            write_data(self, d, screen_name, user_type)   

            self.session.commit()


        self.session.close()



if __name__ == "__main__":
    s = Scrape()
    s.main()

Please help me dumping the data into the database. Thanks in Advance

Aucun commentaire:

Enregistrer un commentaire