Last Updated: December 30, 2017 · hur1can3

Scrape the Foursquare API to get venue information
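
The script below (Python 2, urllib2 + pandas) sweeps a grid of points around a center coordinate, calls the venues/search endpoint at each point, then pulls per-venue details (price, rating, like count) from the venue-details endpoint, merges the two result sets on venue id, and writes everything out to a CSV. Swap in your own Foursquare client_id and client_secret before running, and expect it to take a while: with the defaults the grid is 60 x 60 points and the script sleeps a second per request to stay inside the rate limits.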

import urllib2
import json
import time
import pandas as pd
from pandas.io.json import json_normalize
from pandas import DataFrame
from math import cos, radians

### Helper function for converting meters to lat/long

def distcust(p, d, lat_m, long_m):
    lat = p['lat']
    long = p['long']

    # ~111,111 m per degree of latitude; a degree of longitude shrinks by cos(latitude)
    lat1 = lat + lat_m * d / 111111.0
    long1 = long + long_m * d / (111111.0 * cos(radians(lat)))

    return {'lat': lat1, 'long': long1}

client_id = "YOUR_CLIENT_ID"
client_secret = "YOUR_CLIENT_SECRET"
#p = {'lat': 37.7833, 'long': -122.4167}    # central San Francisco, at Van Ness and Market
#p = {'lat': 40.783011, 'long': -73.965368} # central NYC, at Central Park
p = {'lat': 42.963601, 'long': -85.66878} # Grand Rapids, MI, at Division and Fulton
distance = 100
limit = 50
gridSize = 10
df = DataFrame()
requested_keys = ["categories","id","location","name"]
category = "bar"
category_id = "4d4b7105d754a06376d81259"
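# Sweep a grid of points around the center (10 m apart, roughly +/-300 m with the
# defaults above) and run one venues/search call per grid point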

for x in [x1 / 10.0 for x1 in range(-3*gridSize, 3*gridSize)]:
    for y in [y1 / 10.0 for y1 in range(-3*gridSize, 3*gridSize)]:
        center = distcust(p,distance,x,y)
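        # Search for venues of the chosen category within `distance` meters of this grid point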
        url = "https://api.foursquare.com/v2/venues/search?ll=%s,%s&intent=browse&radius=%s&categoryId=%s&client_id=%s&client_secret=%s&v=%s" % (center["lat"], center["long"], distance, category_id, client_id, client_secret, time.strftime("%Y%m%d"))
        try:
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            data = json.loads(response.read())
            response.close()
            #print data["response"]['venues']
            data = DataFrame(data["response"]['venues'])[requested_keys]


            df2 = DataFrame()
            venue_ids = []
            frames = []

            #print data["id"]
            for d in data["id"]:                
                requested_keys2 = ["id", "price.currency","rating", "likes.count"]

                url2 = "https://api.foursquare.com/v2/venues/%s?client_id=%s&client_secret=%s&v=%s" % (d, client_id, client_secret, time.strftime("%Y%m%d"))
                req2 = urllib2.Request(url2)
                response2 = urllib2.urlopen(req2)
                data2 = json.loads(response2.read())
                response2.close()
                ddata = data2['response']               

                nom_data = json_normalize(ddata['venue'])

                if "price.currency" not in nom_data.columns:
                    nom_data["price.currency"] = 'NONE'

                if "rating" not in nom_data.columns:
                    nom_data["rating"] = 'NONE'                 

                venue_ids.append(d)
                frames.append(nom_data[requested_keys2])
                #print "getting attr for %s" % nom_data["name"]
                time.sleep(1)


            df2 = pd.concat(frames, keys=venue_ids)
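            # Join the per-venue details back onto the search results on the venue id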

            mdata = pd.merge(data, df2,how='left',on='id', suffixes=('_x', '_y'))

            #print mdata

            df = df.append(mdata,ignore_index=True)
            #print df

            #df.to_csv("test.csv")

            print center
            time.sleep(1) # stay within API limits
        except Exception as e:
            print e

df = df.drop_duplicates(subset='id', keep='last') # overlapping search circles return the same venue more than once
print df

df["categories"] = df["categories"].apply(lambda x: dict(x[0])['name'])
df["lat"] = df["location"].apply(lambda x: dict(x)["lat"])
df["long"] = df["location"].apply(lambda x: dict(x)["lng"])
df["distance"] = df["location"].apply(lambda x: dict(x)["distance"])
df["checkins"] = df["stats"].apply(lambda x: dict(x)["checkinsCount"])

ordered_df = df[["id_x","name_x","categories","checkins", "distance","lat","long", "price.currency", "rating", "likes.count"]]
ordered_df.to_csv("foursquare_%s_grand_rapids.csv" % category,encoding='utf-8', index=False)
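
Once the run finishes, the CSV can be pulled straight back into pandas for a quick sanity check. A minimal sketch, assuming the default settings above so the file name matches the one the script writes:

import pandas as pd

venues = pd.read_csv("foursquare_bar_grand_rapids.csv")

# Ten busiest bars by check-in count
top = venues.sort_values("checkins", ascending=False).head(10)
print top[["name", "checkins", "rating", "price.currency"]]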

1 Response

It throws the error "HTTP Error 400: Bad Request". Any idea why?

over 1 year ago
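
A 400 from this API usually means the request itself was rejected, most often bad client_id/client_secret values or a malformed v parameter. The response body spells out the reason, but urllib2 raises before you get a chance to read it. A minimal sketch for dumping that body, assuming the same urllib2 setup as the script above (fetch_json is just an illustrative name):

import json
import urllib2

def fetch_json(url):
    try:
        response = urllib2.urlopen(url)
        return json.loads(response.read())
    except urllib2.HTTPError as e:
        # Foursquare error responses typically carry a JSON body with
        # meta.errorType / meta.errorDetail explaining what was wrong
        print "HTTP %s: %s" % (e.code, e.read())
        return None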