#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2015 Dirk Ceuppens. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This software integrates large parts of the following scripts:
# - search_analytics_api_sample.py, which can be found at
#   https://github.com/google/google-api-python-client/blob/master/samples/searchconsole/search_analytics_api_sample.py
# - the code sample HelloAnalytics.py, which can be found at
#   https://developers.google.com/analytics/devguides/reporting/core/v3/quickstart/service-py
#
# These scripts are used under the following license:
#
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''To use:
1) Check the detailed instructions on
   http://www.trackmedia.be/en/blog/getting-rid-of-not-provided.html
2) Run the script from the command line as shown below.

Sample usage:

  $ python notprovided.py 'https://www.example.com' '2015-05-01' '2015-05-30' 'xxxxxx'

    'https://www.example.com'  = your site
    '2015-05-01'               = start date
    '2015-05-30'               = end date
    'xxxxxx'                   = Google Analytics View ID
'''

import argparse
import csv
import sys
import time
import random
import traceback
from collections import defaultdict
import logging
import simplejson
import socket

from apiclient.discovery import build
from oauth2client.client import SignedJwtAssertionCredentials
import httplib2
from oauth2client import client
from oauth2client import file
from oauth2client import tools
from googleapiclient import sample_tools
from googleapiclient import errors

# Declare command-line flags. The arguments are parsed into variables later on.
argparser = argparse.ArgumentParser(add_help=False)
argparser.add_argument('property_uri', type=str,
                       help=('Site or app URI to query data for (including '
                             'trailing slash).'))
argparser.add_argument('start_date', type=str,
                       help=('Start date of the requested date range in '
                             'YYYY-MM-DD format.'))
argparser.add_argument('end_date', type=str,
                       help=('End date of the requested date range in '
                             'YYYY-MM-DD format.'))
argparser.add_argument('view_id', type=str,
                       help=('Analytics View ID'))
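
# Overall flow of the script (summarising the functions defined below):
#   1. get_analytics_data() pulls the landing pages and their session metrics
#      for sessions with source 'google' (filters='ga:source==google') from the
#      Analytics Core Reporting API and saves them to a csv file.
#   2. get_search_console_data() pulls the top keywords and top pages from the
#      Search Console API, then queries the keywords per landing page (loop())
#      and fills in any remaining keywords/pages (get_additional_data()).
#   3. update_file() merges the resulting csv files into one global report.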
""" f = open(key_file_location, 'rb') key = f.read() f.close() credentials = SignedJwtAssertionCredentials(service_account_email, key, scope=scope) http = credentials.authorize(httplib2.Http()) # Build the service object. service = build(api_name, api_version, http=http) return service def get_results(service, profile_id,startdate,enddate): # Use the Analytics Service Object to query the Core Reporting API # for the number of Google sessions within the requested timeframe return service.data().ga().get( ids='ga:' + profile_id, start_date=startdate, end_date=enddate, metrics='ga:sessions', filters='ga:source==google').execute() def get_landingpages(service, profile_id,startdate,enddate,next_index, max_requests_api): # Use the Analytics Service Object to query the Core Reporting API # for the number of sessions - bounce% - avg Session DUration - Pageviews/session for each of the landingpages within the requested timeframe # Max_results is 1000 if no value is given - the API returns maximum 10.000 results per request return service.data().ga().get( ids='ga:' + profile_id, start_date=startdate, end_date=enddate, metrics='ga:sessions,ga:bounces,ga:avgSessionDuration,ga:pageviewsPerSession', dimensions='ga:landingPagePath', sort='-ga:sessions', # sort sessions descending filters='ga:source==google', max_results= max_requests_api, start_index=next_index).execute() def print_results(results): # Print data nicely for the user. if results: print 'View (Profile): %s' % results.get('profileInfo').get('profileName') print 'Total Sessions: %s' % results.get('rows')[0][0] else: print 'No results found' def save_analytics(response,flags): """Saves results in csv file. Each row contains landingpage, sessions, bounces Args: response: The server response to be printed as a table. """ newrows =[] for row in response: url=row[0].encode('utf-8') newrow = (url, row[1], row[2], row[3],row[4]) newrows.append(newrow) basic_url = flags.property_uri basic_url = basic_url.replace("://","_") basic_url =basic_url.replace(".","-") filename = "Landingpage_analytics_"+flags.start_date+"_"+flags.end_date+"_"+basic_url+".csv" f = open("./"+filename, 'a') print "Saving analytics results to %s" % filename print "./Landingpage_analytics_"+flags.start_date+"_"+flags.end_date+"_"+basic_url+".csv" writer = csv.writer(f) writer.writerows(newrows) f.close() def print_landingpages(results): # Print data nicely for the user. if results: print 'View (Profile): %s' % results.get('profileInfo').get('profileName') print 'Total Google Sessions: %s' % results.get('rows') else: print 'No results found' def get_analytics_data(argv,): flags = argparser.parse_args() # Define the auth scopes to request. scope = ['https://www.googleapis.com/auth/analytics.readonly'] # Use the developer console and replace the values with your # service account email and relative location of your key file. service_account_email = '455813885970-8fo4pv73qg0t2d998lbd6jn8rrbg2tuf@developer.gserviceaccount.com' key_file_location = './Analytics-b7428c9767fd.p12' # Authenticate and construct service. service = get_service('analytics', 'v3', scope, key_file_location, service_account_email) profile = flags.view_id basic_url = flags.property_uri basic_url = basic_url.replace("://","_") basic_url =basic_url.replace(".","-") # Create blank csv file f= open("./Landingpage_analytics_"+flags.start_date+"_"+flags.end_date+"_"+basic_url+".csv", 'wt') writer = csv.writer(f) writer.writerow( ('Landingpage', 'Sessions', 'Bounces', 'avg. Session Duration', 'avg. 


def get_analytics_data(argv):
    flags = argparser.parse_args()

    # Define the auth scopes to request.
    scope = ['https://www.googleapis.com/auth/analytics.readonly']

    # Use the developer console and replace the values with your
    # service account email and relative location of your key file.
    service_account_email = '455813885970-8fo4pv73qg0t2d998lbd6jn8rrbg2tuf@developer.gserviceaccount.com'
    key_file_location = './Analytics-b7428c9767fd.p12'

    # Authenticate and construct service.
    service = get_service('analytics', 'v3', scope, key_file_location,
                          service_account_email)
    profile = flags.view_id

    basic_url = flags.property_uri
    basic_url = basic_url.replace("://", "_")
    basic_url = basic_url.replace(".", "-")

    # Create a blank csv file with the header row.
    f = open("./Landingpage_analytics_" + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv", 'wt')
    writer = csv.writer(f)
    writer.writerow(('Landingpage', 'Sessions', 'Bounces', 'avg. Session Duration',
                     'avg. Pages/Session'))
    f.close()

    max_requests_api = 10000         # The Analytics API returns at most 10,000 rows per request
    max_landingpages_needed = 20000  # If you have more landing pages, this number can be raised
    next_index = 1

    print "Requesting first batch of Analytics data"
    # Send the first request to the Analytics API - the number of rows per
    # request is limited by the API.
    result = get_landingpages(service, profile, flags.start_date, flags.end_date,
                              next_index, min(max_requests_api, max_landingpages_needed))

    # Get the total number of landing pages.
    total_number_of_pages = result['totalResults']
    print "Total number of pages %s" % total_number_of_pages

    next_index = result['query']['max-results'] + 1
    landingpages_table = result['rows']

    # If the total number of landing pages in Analytics is bigger than 10,000
    # (the maximum allowed by the API), request the next batch of data until we
    # reach the maximum number of pages that are required (default: 20,000).
    maximum_pages = min(max_landingpages_needed, total_number_of_pages)
    while next_index < maximum_pages:
        additional_results = get_landingpages(service, profile, flags.start_date,
                                              flags.end_date, next_index, max_requests_api)
        print "Requesting additional data - starting at result %s" % next_index
        next_index = next_index + additional_results['query']['max-results']
        landingpages_table = landingpages_table + additional_results['rows']

    save_analytics(landingpages_table, flags)

    landingpages = []
    for row in landingpages_table:
        newrow = (flags.property_uri + row[0], row[1], row[2], row[3], row[4])
        landingpages.append(newrow)

    print "Analytics data collected."
    return landingpages


####### Search console functions

def get_search_console_data(argv, landingpages):
    service, flags = sample_tools.init(
        argv, 'webmasters', 'v3', __doc__, __file__, parents=[argparser],
        scope='https://www.googleapis.com/auth/webmasters.readonly')

    # Create an initial blank file with the header row.
    basic_url = flags.property_uri
    basic_url = basic_url.replace("://", "_")
    basic_url = basic_url.replace(".", "-")
    f = open("./Landingpage_keyword_" + basic_url + "_" + flags.start_date + "_" + flags.end_date + ".csv", 'wt')
    writer = csv.writer(f)
    writer.writerow(('Keyword', 'Landingpage', 'Clicks', 'Impressions', 'CTR', 'Position'))
    f.close()

    # First run a query to learn which dates we have data for. You should always
    # check which days in a date range have data before running your main query.
    # This query shows data for the entire range, grouped and sorted by day,
    # descending; any days without data will be missing from the results.
    request = {
        'startDate': flags.start_date,
        'endDate': flags.end_date,
        'dimensions': ['date']
    }
    response = execute_request(service, flags.property_uri, request)

    # Get totals for the date range.
    request = {
        'startDate': flags.start_date,
        'endDate': flags.end_date
    }
    response = execute_request(service, flags.property_uri, request)
    print_table(response, 'Totals')

    print "Retrieving top 1000 keywords from search console"
    # Get top queries for the date range, sorted by click count, descending.
    request = {
        'startDate': flags.start_date,
        'endDate': flags.end_date,
        'dimensions': ['query'],
        #'rowLimit': 10
    }
    response = execute_request(service, flags.property_uri, request)
    save_table(response, 'Keyword_' + flags.start_date + "_" + flags.end_date + "_" + basic_url, "wt", "Keyword")
    print "Done - data saved in " + 'Keyword_' + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv"

    print "Retrieving top 1000 landingpages from search console"
    # Get top pages for the date range, sorted by click count, descending.
    request = {
        'startDate': flags.start_date,
        'endDate': flags.end_date,
        'dimensions': ['page'],
        #'rowLimit': 10
    }
    response = execute_request(service, flags.property_uri, request)
    save_table(response, 'Landingpage_' + flags.start_date + "_" + flags.end_date + "_" + basic_url, "wt", "Landingpage")
    print "Done - data saved in " + 'Landingpage_' + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv"

    print "Generating Keyword-Landingpage report"
    # Get the Search Console data for the landing pages retrieved from Analytics.
    loop(landingpages, service, flags)

    print "Now checking additional Keywords:"
    get_additional_data(flags, service, "Keyword")
    print "Now checking additional Landingpages"
    get_additional_data(flags, service, "Landingpage")

    # Final step - put all the collected data in one file.
    update_file(flags)


def loop(landingpages, service, flags):
    for rij in landingpages:
        page = rij[0]
        visits = rij[1]
        if float(visits) >= 5:
            print "Checking result for %s" % page
            #print "visits: %s" % visits
            request = {
                'startDate': flags.start_date,
                'endDate': flags.end_date,
                'dimensions': ['query'],
                'dimensionFilterGroups': [{
                    'filters': [{
                        'dimension': 'page',
                        'expression': page
                    }]
                }]
            }
            n = 0
            request_ok = False
            while n < 10 and not request_ok:
                try:
                    response = execute_request(service, flags.property_uri, request)
                    request_ok = True
                    save_global(response, page, flags)
                except errors.HttpError as e:
                    n = n + 1
                    logging.warning("Reason of error %s - page: %s", e, page)
                    # Apply exponential backoff.
                    time.sleep((2 ** n) + random.randint(0, 1000) / 1000.0)
                    logging.warning("Trying %s of 10 trials", n)
                except socket.error as e:
                    n = n + 1
                    logging.warning("Caught exception socket error: %s - page: %s", e, page)
                    time.sleep((2 ** n) + random.randint(0, 1000) / 1000.0)
                    logging.warning("Trying %s of 10 trials", n)
                except Exception as e:
                    n = n + 1
                    logging.warning("Caught other error")
                    logging.error(traceback.format_exc())
                    time.sleep((2 ** n) + random.randint(0, 1000) / 1000.0)
                    logging.warning("Trying %s of 10 trials", n)
            if not request_ok:
                # If after 10 attempts the API fails to return data, the script stops.
                print "There have been too many errors while trying to contact the API, quitting."
                logging.error("There have been too many errors while trying to contact the API, quitting.")
                sys.exit("Script terminated. Try again later.")


def execute_request(service, property_uri, request):
    """Executes a searchAnalytics.query request.

    Args:
      service: The webmasters service to use when executing the query.
      property_uri: The site or app URI to request data for.
      request: The request to be executed.

    Returns:
      An array of response rows.
    """
    return service.searchanalytics().query(
        siteUrl=property_uri, body=request).execute()


def print_table(response, title):
    """Prints out a response table.

    Each row contains key(s), clicks, impressions, CTR, and average position.

    Args:
      response: The server response to be printed as a table.
      title: The title of the table.
    """
    print title + ':'

    if 'rows' not in response:
        return

    rows = response['rows']
    row_format = '{:<20}' + '{:>20}' * 4
    print row_format.format('Keys', 'Clicks', 'Impressions', 'CTR', 'Position')
    for row in rows:
        keys = ''
        # Keys are returned only if one or more dimensions are requested.
        if 'keys' in row:
            keys = u','.join(row['keys']).encode('utf-8')
        print row_format.format(
            keys, row['clicks'], row['impressions'], row['ctr'], row['position'])
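

# For reference, each row returned by searchanalytics.query has this shape
# (the values shown are illustrative, not real data):
#
#   {'keys': [u'example keyword'], 'clicks': 12.0, 'impressions': 340.0,
#    'ctr': 0.035, 'position': 4.2}
#
# print_table() above and save_global()/save_table() below read exactly these
# five fields.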
def save_global(response, page, flags):
    """Saves results in a csv file.

    Each row contains keyword, landing page, clicks, impressions, CTR,
    and average position.

    Args:
      response: The server response to be saved.
      page: The landing page the response was filtered on.
      flags: The parsed command-line flags.
    """
    if 'rows' not in response:
        return

    newrows = []
    rows = response['rows']
    for row in rows:
        keys = ''
        # Keys are returned only if one or more dimensions are requested.
        if 'keys' in row:
            keys = u','.join(row['keys']).encode('utf-8')
        newrow = (keys, page, row['clicks'], row['impressions'], row['ctr'], row['position'])
        newrows.append(newrow)

    basic_url = flags.property_uri
    basic_url = basic_url.replace("://", "_")
    basic_url = basic_url.replace(".", "-")
    f = open("./Landingpage_keyword_" + basic_url + "_" + flags.start_date + "_" + flags.end_date + ".csv", 'a')
    writer = csv.writer(f)
    writer.writerows(newrows)
    f.close()


def save_table(response, title, mode, data_type):
    '''Saves a response table in a csv file.

    Each row contains key(s), impressions, clicks and average position.

    Args:
      response: The server response to be saved.
      title: The name of the csv file (without extension).
      mode: The file mode ("wt" to create a new file, "a" to append).
      data_type: "Keyword" or "Landingpage", used for the header row.
    '''
    if 'rows' not in response:
        return

    rows = response['rows']
    newrows = []
    for row in rows:
        keys = ''
        # Keys are returned only if one or more dimensions are requested.
        if 'keys' in row:
            keys = u','.join(row['keys']).encode('utf-8')
        newrow = (keys, row['impressions'], row['clicks'], row['position'])
        newrows.append(newrow)

    f = open("./" + title + ".csv", mode)
    writer = csv.writer(f)
    if mode == "wt":
        writer.writerow((data_type, 'impressions', 'clicks', 'avg_position'))
    writer.writerows(newrows)
    f.close()


def get_additional_data(flags, service, data_type):
    # First get the keywords/landing pages from the landingpage-keyword report
    # that have more than 1 click, and store them in vol_data.
    # data_type can be "Keyword" or "Landingpage".
    basic_url = flags.property_uri
    basic_url = basic_url.replace("://", "_")
    basic_url = basic_url.replace(".", "-")

    vol_data = []
    with open("./Landingpage_keyword_" + basic_url + "_" + flags.start_date + "_" + flags.end_date + ".csv") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if float(row['Clicks']) > 1:
                vol_data.append(row[data_type])

    # Then get the keywords/landing pages from the top 1000 queries from Search Console.
    initial_data = []
    with open(data_type + "_" + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            initial_data.append(row[data_type])

    # Now deduplicate the list from the landingpage-keyword report, then drop
    # the keywords/landing pages that have already been queried.
    # Store the remainder in data_to_check.
    data_to_check = list(set(vol_data) - set(initial_data))

    if data_type == "Keyword":
        dimension = 'query'
    if data_type == "Landingpage":
        dimension = 'page'
    print dimension

    for rij in data_to_check:
        key = rij
        print "Checking data for %s" % key
        request = {
            'startDate': flags.start_date,
            'endDate': flags.end_date,
            'dimensions': [dimension],
            'dimensionFilterGroups': [{
                'filters': [{
                    'dimension': dimension,
                    'expression': key
                }]
            }]
        }
        n = 0
        request_ok = False
        while n < 10 and not request_ok:
            try:
                response = execute_request(service, flags.property_uri, request)
                save_table(response, data_type + "_" + flags.start_date + "_" + flags.end_date + "_" + basic_url, "a", data_type)
                request_ok = True
            except errors.HttpError as e:
                n = n + 1
                logging.warning("Reason of error %s - data: %s", e, key)
                # Apply exponential backoff.
                time.sleep((2 ** n) + random.randint(0, 1000) / 1000.0)
                print "Trying %s of 10 trials" % n
                logging.warning("Trying %s of 10 trials", n)
            except socket.error as e:
                n = n + 1
                logging.warning("Caught exception socket error: %s - data: %s", e, key)
                time.sleep((2 ** n) + random.randint(0, 1000) / 1000.0)
                logging.warning("Trying %s of 10 trials", n)
            except Exception as e:
                n = n + 1
                logging.warning("Caught other error")
                logging.error(traceback.format_exc())
                time.sleep((2 ** n) + random.randint(0, 1000) / 1000.0)
                logging.warning("Trying %s of 10 trials", n)
        if not request_ok:
            print "There have been too many errors while trying to contact the API, quitting."
            logging.error("There have been too many errors while trying to contact the API, quitting.")
            sys.exit("Script terminated. Try again later.")


def update_file(flags):
    # Combine all the data previously collected and stored in 3 separate csv
    # files into one global csv file.
    print "Merging files"

    basic_url = flags.property_uri
    basic_url = basic_url.replace("://", "_")
    basic_url = basic_url.replace(".", "-")

    matchfile = "Landingpage_analytics_" + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv"
    matchfile2 = 'Landingpage_' + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv"
    matchfile3 = "Keyword_" + flags.start_date + "_" + flags.end_date + "_" + basic_url + ".csv"
    csvfile = "Landingpage_keyword_" + basic_url + "_" + flags.start_date + "_" + flags.end_date + ".csv"
    newfile = "Global_report_" + basic_url + "_" + flags.start_date + "_" + flags.end_date + ".csv"
    print matchfile
    print matchfile2
    print matchfile3

    new_data = []
    new_data2 = []
    new_data3 = []

    with open(csvfile) as fin, open(matchfile) as fmatch, open(matchfile2) as fmatch2, \
            open(matchfile3) as fmatch3, open(newfile, 'wb') as fout:
        reader = csv.reader(fin)
        checker = csv.reader(fmatch)
        checker2 = csv.reader(fmatch2)
        checker3 = csv.reader(fmatch3)
        writer = csv.writer(fout)

        # Skip the header rows of the input files and write the merged header.
        header_reader = next(reader)
        next(checker)
        next(checker2)
        next(checker3)
        header_checker = ["Clicks Keyword", "Impressions Keyword", "Avg. Pos. Keyword", "CTR Keyword",
                          "Clicks Landingpage", "Impressions Landingpage", "Avg. Pos. Landingpage", "CTR Landingpage",
                          "GA Sessions", "GA Bouncerate", "GA Avg session duration", "GA Avg. Pages/Session"]
        new_header = header_reader + header_checker
        writer.writerow(new_header)

        # Build lookup tables keyed on keyword, landing page and landing page URL.
        analytics = {}
        keyword = {}
        landingpage = {}
        for a in checker3:
            keyword[a[0]] = [a[2], a[1], a[3], float(a[2]) / float(a[1])]
        for b in checker2:
            landingpage[b[0]] = [b[2], b[1], b[3], float(b[2]) / float(b[1])]
        for xl in checker:
            url = flags.property_uri + xl[0]
            analytics[url] = [xl[1], float(xl[2]) / float(xl[1]), xl[3], xl[4]]

        for i in reader:
            if i[0] in keyword:
                newrow = i + keyword[i[0]]
            else:
                # if keyword doesn't have specific data
                newrow = i + ['N/A', 'N/A', 'N/A', 'N/A']
            new_data.append(newrow)

        for i in new_data:
            if i[1] in landingpage:
                newrow = i + landingpage[i[1]]
            else:
                # if landingpage doesn't have specific data
                newrow = i + ['N/A', 'N/A', 'N/A', 'N/A']
            new_data2.append(newrow)

        for i in new_data2:
            if i[1] in analytics:
                newrow = i + analytics[i[1]]
                new_data3.append(newrow)

        # Sort the merged rows by clicks, descending.
        new_data3.sort(key=lambda row: float(row[2]), reverse=True)
        writer.writerows(new_data3)


if __name__ == '__main__':
    logging.basicConfig(filename='notprovided.log', level=logging.WARNING)
    logging.info('Started')
    # Get the top landing pages from the Analytics API.
    results = get_analytics_data(sys.argv)
    # Send the landing pages to the Search Console API.
    get_search_console_data(sys.argv, results)
    logging.info('Finished')
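
# A worked example of the expected invocation and output files (the domain,
# dates and View ID below are placeholders, not real data). With property_uri
# 'https://www.example.com', the basic_url transformation used above
# ("://" -> "_", "." -> "-") yields 'https_www-example-com', so running
#
#   $ python notprovided.py 'https://www.example.com' '2015-05-01' '2015-05-30' '12345678'
#
# should produce, in the working directory:
#
#   Landingpage_analytics_2015-05-01_2015-05-30_https_www-example-com.csv
#   Keyword_2015-05-01_2015-05-30_https_www-example-com.csv
#   Landingpage_2015-05-01_2015-05-30_https_www-example-com.csv
#   Landingpage_keyword_https_www-example-com_2015-05-01_2015-05-30.csv
#   Global_report_https_www-example-com_2015-05-01_2015-05-30.csv  (the merged report)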