import pandas as pd
from pytrends.request import TrendReq
%matplotlib inline
pip install matplotlib-inline
Requirement already satisfied: matplotlib-inline in c:\users\jki\anaconda3\lib\site-packages (0.1.6) Requirement already satisfied: traitlets in c:\users\jki\anaconda3\lib\site-packages (from matplotlib-inline) (5.7.1) Note: you may need to restart the kernel to use updated packages.
pip install pytrends
Requirement already satisfied: pytrends in c:\users\jki\anaconda3\lib\site-packages (4.9.2) Requirement already satisfied: requests>=2.0 in c:\users\jki\anaconda3\lib\site-packages (from pytrends) (2.31.0) Requirement already satisfied: pandas>=0.25 in c:\users\jki\anaconda3\lib\site-packages (from pytrends) (1.5.3) Requirement already satisfied: lxml in c:\users\jki\anaconda3\lib\site-packages (from pytrends) (4.9.2) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\jki\anaconda3\lib\site-packages (from pandas>=0.25->pytrends) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\jki\anaconda3\lib\site-packages (from pandas>=0.25->pytrends) (2022.7) Requirement already satisfied: numpy>=1.21.0 in c:\users\jki\anaconda3\lib\site-packages (from pandas>=0.25->pytrends) (1.24.3) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\jki\anaconda3\lib\site-packages (from requests>=2.0->pytrends) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\jki\anaconda3\lib\site-packages (from requests>=2.0->pytrends) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\jki\anaconda3\lib\site-packages (from requests>=2.0->pytrends) (1.26.16) Requirement already satisfied: certifi>=2017.4.17 in c:\users\jki\anaconda3\lib\site-packages (from requests>=2.0->pytrends) (2023.7.22) Requirement already satisfied: six>=1.5 in c:\users\jki\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas>=0.25->pytrends) (1.16.0) Note: you may need to restart the kernel to use updated packages.
# import Google Analytics Data
ga_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_keyword.csv")
ga_data.head(5)
# ---------------------------------------- | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | # Miratrix | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | # Channels | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | # 20180831-20190831 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | # ---------------------------------------- | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# import google analytics keyword data
ga_kw_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_keyword.csv",skiprows = 6, nrows =5)
ga_kw_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_keyword.csv",skiprows = 14, nrows =366,
parse_dates = ['Day Index'])
ga_kw_data.head(4)
Keyword | Users | New Users | Sessions | Bounce Rate | Pages / Session | Avg. Session Duration | Contact From Good Lead (Goal 1 Conversion Rate) | Contact From Good Lead (Goal 1 Completions) | Contact From Good Lead (Goal 1 Value) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | (not provided) | 615 | 591 | 727 | 35.90% | 1.75 | 00:01:42 | 0.00% | 0 | £0.00 |
1 | amazon | 3 | 3 | 3 | 100.00% | 1.00 | 00:00:00 | 0.00% | 0 | £0.00 |
2 | (not set) | 1 | 1 | 1 | 100.00% | 1.00 | 00:00:00 | 0.00% | 0 | £0.00 |
3 | miratrix | 1 | 1 | 1 | 0.00% | 2.00 | 00:11:37 | 0.00% | 0 | £0.00 |
ga_kw_traffic_data.head()
Day Index | Users | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-08-31 | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2018-09-01 | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2018-09-02 | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2018-09-03 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2018-09-04 | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# import google analytics pages data
ga_page_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_pages.csv",skiprows = 6,nrows =376)
ga_page_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_pages.csv",skiprows =386,nrows =367, parse_dates=['Day Index'])
ga_page_data.head()
Page | Pageviews | Unique Pageviews | Avg. Time on Page | Entrances | Bounce Rate | % Exit | Page Value | |
---|---|---|---|---|---|---|---|---|
0 | / | 813 | 665 | 00:02:05 | 641 | 14.35% | 55.97% | £0.00 |
1 | /www.miratrix.co.uk | 515 | 388 | 00:02:17 | 373 | 59.25% | 56.89% | £0.00 |
2 | /get-in-touch/ | 336 | 163 | 00:01:06 | 23 | 10.53% | 33.04% | £0.00 |
3 | /app-marketing-agency/ | 140 | 120 | 00:02:04 | 73 | 18.31% | 57.86% | £0.00 |
4 | /app-store-optimization-services/ | 136 | 105 | 00:01:30 | 22 | 14.29% | 47.79% | £0.00 |
ga_page_traffic_data.head()
Day Index | Pageviews | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | |
---|---|---|---|---|---|---|---|---|
0 | 2018-08-31 | 14 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2018-09-01 | 6 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2018-09-02 | 17 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2018-09-03 | 14 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2018-09-04 | 11 | NaN | NaN | NaN | NaN | NaN | NaN |
#Import google analytics social data
ga_social_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_social_data.csv", skiprows=5, nrows=7)
ga_social_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_social_data.csv", skiprows=15, nrows=366, parse_dates=['Day Index'])
ga_social_data.head()
Social Network | Users | New Users | Sessions | Bounce Rate | Pages / Session | Avg. Session Duration | Contact From Good Lead (Goal 1 Conversion Rate) | Contact From Good Lead (Goal 1 Completions) | Contact From Good Lead (Goal 1 Value) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 29 | 29 | 29 | 37.93% | 1.00 | <00:00:01 | 0.00% | 0 | £0.00 | |
1 | 6 | 4 | 7 | 0.00% | 2.14 | 00:04:50 | 0.00% | 0 | £0.00 | |
2 | 2 | 2 | 2 | 100.00% | 1.00 | 00:00:00 | 0.00% | 0 | £0.00 | |
3 | 1 | 1 | 1 | 100.00% | 1.00 | 00:00:00 | 0.00% | 0 | £0.00 | |
4 | LiveJournal | 1 | 1 | 1 | 0.00% | 1.00 | 00:00:00 | 0.00% | 0 | £0.00 |
ga_social_traffic_data.head()
Day Index | Users | |
---|---|---|
0 | 2018-08-31 | 0 |
1 | 2018-09-01 | 0 |
2 | 2018-09-02 | 0 |
3 | 2018-09-03 | 0 |
4 | 2018-09-04 | 0 |
#Import google analytics other data
ga_other_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_other.csv", skiprows=6, nrows=5)
ga_other_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_other.csv", skiprows=13,
nrows=366, parse_dates=['Day Index'])
ga_other_data.head()
Source | Users | New Users | Sessions | Bounce Rate | Pages / Session | Avg. Session Duration | Contact From Good Lead (Goal 1 Conversion Rate) | Contact From Good Lead (Goal 1 Completions) | Contact From Good Lead (Goal 1 Value) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | directory | 116 | 110 | 143 | 18.18% | 2.14 | 00:02:34 | 0.00% | 0 | £0.00 |
1 | google_business | 35 | 35 | 36 | 16.67% | 1.81 | 00:02:00 | 0.00% | 0 | £0.00 |
2 | nav | 26 | 0 | 43 | 6.98% | 2.40 | 00:04:04 | 0.00% | 0 | £0.00 |
3 | companyemail | 16 | 12 | 23 | 43.48% | 1.65 | 00:00:59 | 0.00% | 0 | £0.00 |
4 | Clickky DevTools Listing | 3 | 3 | 3 | 0.00% | 2.67 | 00:01:45 | 0.00% | 0 | £0.00 |
ga_other_traffic_data.head()
Day Index | Users | |
---|---|---|
0 | 2018-08-31 | 1 |
1 | 2018-09-01 | 0 |
2 | 2018-09-02 | 1 |
3 | 2018-09-03 | 1 |
4 | 2018-09-04 | 0 |
#Import google analytics total data
ga_total_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_total_channels.csv", skiprows=6, nrows=5)
ga_total_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_total_channels.csv", skiprows=13,
nrows=366, parse_dates=['Day Index'])
ga_total_data.head()
Channel Grouping | Users | New Users | Sessions | Bounce Rate | Pages / Session | Avg. Session Duration | Contact From Good Lead (Goal 1 Conversion Rate) | Contact From Good Lead (Goal 1 Completions) | Contact From Good Lead (Goal 1 Value) | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Organic Search | 622 | 598 | 734 | 36.10% | 1.75 | 00:01:43 | 0.00% | 0 | £0.00 |
1 | Direct | 601 | 599 | 665 | 57.74% | 1.37 | 00:00:42 | 0.00% | 0 | £0.00 |
2 | (Other) | 194 | 160 | 248 | 18.15% | 2.10 | 00:02:35 | 0.00% | 0 | £0.00 |
3 | Referral | 100 | 88 | 144 | 23.61% | 3.85 | 00:02:54 | 2.08% | 3 | £0.00 |
4 | Social | 41 | 39 | 42 | 35.71% | 1.19 | 00:00:49 | 0.00% | 0 | £0.00 |
ga_total_traffic_data.head()
Day Index | Users | |
---|---|---|
0 | 2018-08-31 | 5 |
1 | 2018-09-01 | 4 |
2 | 2018-09-02 | 5 |
3 | 2018-09-03 | 7 |
4 | 2018-09-04 | 7 |
#Importing query data from Google Search Console
gsc_query = pd.read_csv(r"C:\Users\jki\Downloads\gsc_data\gsc_query.csv")
gsc_query.head()
Query | Clicks | Impressions | CTR | Position | |
---|---|---|---|---|---|
0 | miratrix | 71 | 760 | 9.34% | 6.23 |
1 | app marketing agency uk | 23 | 398 | 5.78% | 3.36 |
2 | mobile marketing agency london | 11 | 2529 | 0.43% | 3.97 |
3 | app marketing uk | 9 | 261 | 3.45% | 2.01 |
4 | app marketing agency | 7 | 9860 | 0.07% | 39.21 |
#Importing pages data from Google Search Console
gsc_page = pd.read_csv(r"C:\Users\jki\Downloads\gsc_data\gsc_pages.csv")
gsc_page.head()
Page | Clicks | Impressions | CTR | Position | |
---|---|---|---|---|---|
0 | https://miratrix.co.uk/ | 371 | 100673 | 0.37% | 42.59 |
1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.26% | 26.84 |
2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 1.1% | 30.64 |
3 | https://miratrix.co.uk/blippar-the-message-isn... | 11 | 517 | 2.13% | 36.05 |
4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 7.14% | 5.61 |
#Importing country data from Google Search Console
gsc_country = pd.read_csv(r"C:\Users\jki\Downloads\gsc_data\gsc_country.csv")
gsc_country.head()
Country | Clicks | Impressions | CTR | Position | |
---|---|---|---|---|---|
0 | United Kingdom | 310 | 18836 | 1.65% | 48.67 |
1 | India | 97 | 25622 | 0.38% | 51.38 |
2 | United States | 29 | 14493 | 0.2% | 45.92 |
3 | Germany | 14 | 2516 | 0.56% | 55.01 |
4 | France | 12 | 2167 | 0.55% | 43.88 |
#Importing Device data from Google Search Console
#Importing query appearance data from Google Search Console
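The corresponding import cells aren't included in this notebook. As a minimal sketch, assuming the exports were saved alongside the other GSC files under the hypothetical filenames gsc_device.csv and gsc_search_appearance.csv:
gsc_device = pd.read_csv(r"C:\Users\jki\Downloads\gsc_data\gsc_device.csv") #hypothetical filename
gsc_appearance = pd.read_csv(r"C:\Users\jki\Downloads\gsc_data\gsc_search_appearance.csv") #hypothetical filename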
#Import PPC data
ppc_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\ppc_data.csv")
ppc_data.head()
Unnamed: 0 | Day | Ad group | Avg. CPC | CTR | Campaign | Clicks | Conv. rate | Conversions | Cost | Cost / conv. | Final URL | Impr. | Keyword | Keyword status | Max. CPC | Mobile final URL | Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2018-08-31 | miratrix_ppc_adgroup | 2.490563 | 16.894144 | miratrix_ppc | 75 | 0.225225 | 1 | 1105.809783 | 1105.809783 | Text | 7501 | keyword | Active | 2.6 | Text | Active |
1 | 1 | 2018-09-01 | miratrix_ppc_adgroup | 2.472161 | 5.976134 | miratrix_ppc | 17 | 1.193317 | 5 | 1035.835572 | 207.167114 | Text | 2504 | keyword | Active | 2.6 | Text | Active |
2 | 2 | 2018-09-02 | miratrix_ppc_adgroup | 2.508561 | 6.824919 | miratrix_ppc | 3 | 0.107411 | 1 | 2335.470042 | 2335.470042 | Text | 6354 | keyword | Active | 2.6 | Text | Active |
3 | 3 | 2018-09-03 | miratrix_ppc_adgroup | 2.548919 | 8.058366 | miratrix_ppc | 99 | 2.723735 | 7 | 655.072102 | 93.581729 | Text | 2071 | keyword | Active | 2.6 | Text | Active |
4 | 4 | 2018-09-04 | miratrix_ppc_adgroup | 2.414868 | 6.884937 | miratrix_ppc | 93 | 1.464435 | 7 | 1154.306794 | 164.900971 | Text | 3291 | keyword | Active | 2.6 | Text | Active |
#Import Facebook ad data
fb_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\facebook-data-ads.csv",
parse_dates=['Reporting starts','Reporting ends'])
fb_data.head()
Unnamed: 0 | Reporting starts | Reporting ends | Campaign name | Delivery | Mobile app installs | Mobile app actions | Cost per mobile app install (GBP) | Cost per mobile app action (GBP) | Desktop app installs | Desktop app engagement | Cost per desktop app install (GBP) | Cost per desktop app engagement (GBP) | Post reactions | Post comments | Post shares | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2014-02-21 | 2018-01-29 | Campaign1 | 0 | 11451.0 | NaN | 1.275683 | NaN | NaN | NaN | NaN | NaN | 1807.0 | 42.0 | 133.0 |
1 | 1 | 2015-12-04 | 2015-12-04 | Campaign2 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
2 | 2 | 2015-10-21 | 2015-10-21 | Campaign3 | inactive | 2.0 | NaN | 3.530000 | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
3 | 3 | 2015-10-20 | 2015-10-20 | Campaign3 | inactive | 2.0 | NaN | 16.500000 | NaN | NaN | NaN | NaN | NaN | 15.0 | NaN | NaN |
4 | 4 | 2015-10-20 | 2015-10-20 | Campaign4 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Import site visibility CSV
site_visibility = pd.read_csv(r"C:\Users\jki\Downloads\other_data\site_visibility.csv")
site_visibility.head()
Day Index | apptamin.com | miratrix.co.uk | |
---|---|---|---|
0 | 2018-08-31 | 35.354169 | 79.576190 |
1 | 2018-09-01 | 97.529284 | 45.990192 |
2 | 2018-09-02 | 8.643297 | 63.922349 |
3 | 2018-09-03 | 19.927875 | 54.022379 |
4 | 2018-09-04 | 47.617416 | 57.075055 |
ranking_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\ranking_data.csv")
ranking_data.head()
Day Index | app competitor analysis | app marketing agency | app marketing agency london | app marketing agency uk | app marketing uk | app store optimisation agency | aso audit | mobile advertising agency | mobile advertising agency london | mobile app marketing agency | mobile marketing agency | mobile marketing agency london | mobile marketing services uk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-08-31 | 6 | 1 | 6 | 10 | 21 | 20 | 17 | 26 | 0 | 21 | 23 | 9 | 1 |
1 | 2018-09-01 | 21 | 5 | 12 | 8 | 1 | 28 | 12 | 16 | 18 | 7 | 0 | 7 | 7 |
2 | 2018-09-02 | 22 | 7 | 14 | 29 | 19 | 16 | 14 | 10 | 10 | 28 | 10 | 4 | 6 |
3 | 2018-09-03 | 28 | 28 | 29 | 23 | 0 | 10 | 11 | 8 | 27 | 21 | 3 | 11 | 27 |
4 | 2018-09-04 | 14 | 14 | 26 | 17 | 11 | 26 | 13 | 17 | 11 | 22 | 8 | 28 | 0 |
ranking_data[['Day Index','app competitor analysis']].head()
Day Index | app competitor analysis | |
---|---|---|
0 | 2018-08-31 | 6 |
1 | 2018-09-01 | 21 |
2 | 2018-09-02 | 22 |
3 | 2018-09-03 | 28 |
4 | 2018-09-04 | 14 |
In this section we're going to see how to use an open-source API to gather data that we can use in our strategic analysis. IMPORTANT: currently this API will only work with pandas 0.23.4. This will change in the future. https://github.com/GeneralMills/pytrends. Thanks to General Mills for creating this handy repository.
#Let's create a variable that stores the API settings
pytrends = TrendReq(hl='en-GB',
tz=360, #timezone offset
timeout=(10,25), #time out after a few seconds
retries=2, #retry twice on failure
backoff_factor=0.1) #back off between retry attempts
#Create a list with 5 keywords you'd like to investigate
kw_list = ["seo audit", "aso audit", "app marketing agency","seo agency", "aso agency"]
It's possible to change the categories, locations and regions. You can find your category here: https://github.com/pat310/google-trends-api/wiki/Google-Trends-Categories
#Pass your variables over to Google Trends
pytrends.build_payload(kw_list, #keywords to look up
cat=83, #category to look in
timeframe='today 5-y', #timeframe
geo='', #country ('' means worldwide)
gprop='') #Google property ('' means web search)
!pip install retrying
Collecting retrying Obtaining dependency information for retrying from https://files.pythonhosted.org/packages/8f/04/9e36f28be4c0532c0e9207ff9dc01fb13a2b0eb036476a213b0000837d0e/retrying-1.3.4-py3-none-any.whl.metadata Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB) Requirement already satisfied: six>=1.7.0 in c:\users\jki\anaconda3\lib\site-packages (from retrying) (1.16.0) Downloading retrying-1.3.4-py3-none-any.whl (11 kB) Installing collected packages: retrying Successfully installed retrying-1.3.4
import pandas as pd
from retrying import retry
@retry(stop_max_attempt_number=5) # Retry for a maximum of 5 times
def fetch_trends_data():
trends_data = pytrends.interest_over_time()
return trends_data
try:
trends_data = fetch_trends_data()
# Convert the trends data to a Pandas DataFrame
trends_df = pd.DataFrame(trends_data)
print(trends_df)
except Exception as e:
print("Error fetching trends data:", e)
seo audit aso audit app marketing agency seo agency \ date 2019-03-10 0 0 25 27 2019-03-17 0 29 23 58 2019-03-24 0 0 0 49 2019-03-31 0 0 0 73 2019-04-07 0 0 0 91 ... ... ... ... ... 2024-02-11 0 0 0 83 2024-02-18 0 0 0 61 2024-02-25 0 0 0 78 2024-03-03 0 0 0 0 2024-03-10 0 0 0 0 aso agency isPartial date 2019-03-10 0 False 2019-03-17 0 False 2019-03-24 0 False 2019-03-31 0 False 2019-04-07 0 False ... ... ... 2024-02-11 0 False 2024-02-18 0 False 2024-02-25 0 False 2024-03-03 0 False 2024-03-10 0 True [262 rows x 6 columns]
trends_data.plot()
<Axes: xlabel='date'>
In this section you'll be visualising the data we loaded previously in this notebook using Pandas' built-in .plot() function. The function helps you create visualisations of simple data quickly without the need to load other libraries.
#Visualise page data using a bar chart, sorted by pageviews
ga_page_data[:10].sort_values(by='Pageviews', ascending=False).plot.bar(figsize=(15,5))
<Axes: >
#Plot another bar chart, give it a secondary y axis for Impressions and stack the bars
gsc_query[:10].plot.bar(figsize=(25,5), secondary_y="Impressions", stacked=True)
<Axes: >
#Plot another bar chart but this time increase the font size
gsc_country[['Country','Clicks']][:10].plot.bar(figsize=(15,4), fontsize=14)
<Axes: >
ppc_data['Cost'][:10].plot(kind='line', figsize=(15,5))
<Axes: >
fb_data.head()
Unnamed: 0 | Reporting starts | Reporting ends | Campaign name | Delivery | Mobile app installs | Mobile app actions | Cost per mobile app install (GBP) | Cost per mobile app action (GBP) | Desktop app installs | Desktop app engagement | Cost per desktop app install (GBP) | Cost per desktop app engagement (GBP) | Post reactions | Post comments | Post shares | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2014-02-21 | 2018-01-29 | Campaign1 | 0 | 11451.0 | NaN | 1.275683 | NaN | NaN | NaN | NaN | NaN | 1807.0 | 42.0 | 133.0 |
1 | 1 | 2015-12-04 | 2015-12-04 | Campaign2 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
2 | 2 | 2015-10-21 | 2015-10-21 | Campaign3 | inactive | 2.0 | NaN | 3.530000 | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
3 | 3 | 2015-10-20 | 2015-10-20 | Campaign3 | inactive | 2.0 | NaN | 16.500000 | NaN | NaN | NaN | NaN | NaN | 15.0 | NaN | NaN |
4 | 4 | 2015-10-20 | 2015-10-20 | Campaign4 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Scatter charts are useful for spotting relationship patterns, create one between two metrics
fb_data.plot.scatter(x='Mobile app installs',
y='Cost per mobile app install (GBP)',figsize=(15,5))
<Axes: xlabel='Mobile app installs', ylabel='Cost per mobile app install (GBP)'>
#Plot two scatter graphs on top of each other and format it
ax = fb_data.plot.scatter(x='Post reactions',y='Cost per mobile app install (GBP)',
color='Magenta', label='Group 1',
alpha=.6, s=fb_data['Cost per mobile app install (GBP)']*5,
figsize=(15,5))
fb_data.plot.scatter(x='Mobile app installs',y='Cost per mobile app install (GBP)',
color='Cyan',
label='Group 2', ax=ax, alpha=.5,
figsize=(15,5))
<Axes: xlabel='Mobile app installs', ylabel='Cost per mobile app install (GBP)'>
#Plot a line graph with a secondary y of the largest value
trends_data.plot(secondary_y=('seo agency'), figsize=(15,5))
<Axes: xlabel='date'>
#Now plot a chart of the cumulative sum of all the data
trends_data.cumsum().plot(figsize=(15,5))
<Axes: xlabel='date'>
In order to use your marketing data correctly it needs to be in the right format to run calculations, draw charts and make predictions. Getting your data into the correct order is referred to as data wrangling.
In this section you will:
import pandas as pd # Load the pandas library
#Import google analytics pages data
ga_page_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_pages.csv", skiprows=6, nrows=376)
ga_page_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_pages.csv",
skiprows=386, nrows=366, parse_dates=['Day Index'])
#Importing pages data from Google Search Console
gsc_page = pd.read_csv(r"C:\Users\jki\Downloads\gsc_data\gsc_pages.csv")
#Import Facebook ad data
fb_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\facebook-data-ads.csv",
parse_dates=['Reporting starts','Reporting ends'])
We know that some of the data being gathered in Google Analytics is wrong. Google Analytics has been gathering homepage data under two pages: / and /www.miratrix.co.uk.
Also, during this process we will want to create a URL that can be paired with data gathered from Google Search Console. I'll split the data on '/' ('/' denotes a break in the URL) to break each URL into three parts, then rebuild the URL in an order that matches the GSC URLs.
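Before applying it to the real data, here's a minimal sketch of how pandas' str.rpartition behaves, splitting each string on its last '/' into the text before it, the separator itself, and the text after it:
import pandas as pd

sample = pd.Series(["/get-in-touch/", "/www.miratrix.co.uk"])
print(sample.str.rpartition("/")) #columns 0, 1, 2: before, separator, after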
#Look at Google Analytics data
ga_page_data.head()
Page | Pageviews | Unique Pageviews | Avg. Time on Page | Entrances | Bounce Rate | % Exit | Page Value | |
---|---|---|---|---|---|---|---|---|
0 | / | 813 | 665 | 00:02:05 | 641 | 14.35% | 55.97% | £0.00 |
1 | /www.miratrix.co.uk | 515 | 388 | 00:02:17 | 373 | 59.25% | 56.89% | £0.00 |
2 | /get-in-touch/ | 336 | 163 | 00:01:06 | 23 | 10.53% | 33.04% | £0.00 |
3 | /app-marketing-agency/ | 140 | 120 | 00:02:04 | 73 | 18.31% | 57.86% | £0.00 |
4 | /app-store-optimization-services/ | 136 | 105 | 00:01:30 | 22 | 14.29% | 47.79% | £0.00 |
#Split the URL on its last '/'
split_dom = ga_page_data.Page.str.rpartition("/")
split_dom.head()
0 | 1 | 2 | |
---|---|---|---|
0 | / | ||
1 | / | www.miratrix.co.uk | |
2 | /get-in-touch | / | |
3 | /app-marketing-agency | / | |
4 | /app-store-optimization-services | / |
#Fill the empty third column (homepage rows) with the domain
split_dom[2].replace("", "miratrix.co.uk", inplace=True)
#Change the URL format so that it matches the other data sources
split_dom[2].replace("www.miratrix.co.uk", "miratrix.co.uk", inplace=True)
split_dom.head()
0 | 1 | 2 | |
---|---|---|---|
0 | / | miratrix.co.uk | |
1 | / | miratrix.co.uk | |
2 | /get-in-touch | / | miratrix.co.uk |
3 | /app-marketing-agency | / | miratrix.co.uk |
4 | /app-store-optimization-services | / | miratrix.co.uk |
#Build the URL and replace the current URLs with the new URL format
ga_page_data.Page = "https://" + split_dom[2] + split_dom[0] + "/"
ga_page_data.head()
Page | Pageviews | Unique Pageviews | Avg. Time on Page | Entrances | Bounce Rate | % Exit | Page Value | |
---|---|---|---|---|---|---|---|---|
0 | https://miratrix.co.uk/ | 813 | 665 | 00:02:05 | 641 | 14.35% | 55.97% | £0.00 |
1 | https://miratrix.co.uk/ | 515 | 388 | 00:02:17 | 373 | 59.25% | 56.89% | £0.00 |
2 | https://miratrix.co.uk/get-in-touch/ | 336 | 163 | 00:01:06 | 23 | 10.53% | 33.04% | £0.00 |
3 | https://miratrix.co.uk/app-marketing-agency/ | 140 | 120 | 00:02:04 | 73 | 18.31% | 57.86% | £0.00 |
4 | https://miratrix.co.uk/app-store-optimization-... | 136 | 105 | 00:01:30 | 22 | 14.29% | 47.79% | £0.00 |
In order to fix the issue of having two homepage URLs (and possibly more) we will need to group the data by Page. This poses some issues, as not all the data is numerical and you are also dealing with a timeseries.
What can be done is to change the object fields to numerical types and to convert the time-on-page column to a timedelta.
ga_page_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 376 entries, 0 to 375 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Page 376 non-null object 1 Pageviews 376 non-null int64 2 Unique Pageviews 376 non-null int64 3 Avg. Time on Page 376 non-null object 4 Entrances 376 non-null int64 5 Bounce Rate 376 non-null object 6 % Exit 376 non-null object 7 Page Value 376 non-null object dtypes: int64(3), object(5) memory usage: 23.6+ KB
#Replace 0 with time format 00:00:00
ga_page_data['Avg. Time on Page'] = ga_page_data['Avg. Time on Page'].replace(0, '00:00:00')
#Convert column from string to time delta
ga_page_data['Avg. Time on Page'] = pd.to_timedelta(ga_page_data['Avg. Time on Page'])
#Replace % with nothing
ga_page_data['Bounce Rate'] = ga_page_data['Bounce Rate'].str.replace('%',"")
#Convert the column into a Float
ga_page_data['Bounce Rate'] = ga_page_data['Bounce Rate'].astype('float')
#Replace % with nothing
ga_page_data['% Exit'] = ga_page_data['% Exit'].str.replace('%',"")
#Convert the column into a Float
ga_page_data['% Exit'] = ga_page_data['% Exit'].astype('float')
#Replace £ with nothing
ga_page_data['Page Value'] = ga_page_data['Page Value'].str.replace('£',"")
#Convert the column into a Float
ga_page_data['Page Value'] = ga_page_data['Page Value'].astype('float')
#Set the index for the dataframe to Page
ga_page_data.set_index('Page',inplace=True)
#Check the data to make sure the data types have changed
ga_page_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 376 entries, https://miratrix.co.uk/ to https://www.miratrix.co.uk#utm_source=nav&utm_medium=traffic&utm_campaign=datatools/ Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pageviews 376 non-null int64 1 Unique Pageviews 376 non-null int64 2 Avg. Time on Page 376 non-null timedelta64[ns] 3 Entrances 376 non-null int64 4 Bounce Rate 376 non-null float64 5 % Exit 376 non-null float64 6 Page Value 376 non-null float64 dtypes: float64(3), int64(3), timedelta64[ns](1) memory usage: 23.5+ KB
#Groupby Index and get the mean of the 'Avg. Time on Page'
avg_time = ga_page_data.groupby(ga_page_data.index)['Avg. Time on Page'].mean(numeric_only=False)
#Convert avg_time to dataframe and store as ga_Time
ga_time = pd.DataFrame(avg_time)
#Groupby bounce, exit, pageviews and Unique pageviews using the appropriate math
#and store in a dataframe. Save as ga_bou_ex_type
ga_bou_ex_type = pd.DataFrame(ga_page_data.groupby(ga_page_data.index).agg({
'Bounce Rate': ['mean'], "% Exit": ['mean'],'Pageviews': sum, 'Unique Pageviews': sum}))
#Check your time groupby
ga_time['Avg. Time on Page'].head()
Page https://#utm_source=nav&utm_medium=traffic&utm_campaign=datatools/ 0 days 00:00:19 https://#utm_source=nav&utm_medium=traffic&utm_campaign=datatools/app-marketing-agency/ 0 days 00:00:04 https://+&cd=10&hl=en&ct=clnk&gl=in/search?q=cache:bI4Yk20ZHV8J:https://miratrix.co.uk/get-in-touch/ 0 days 00:00:00 https://+&cd=5&hl=en&ct=clnk&gl=uk/search?q=cache:CAP7TiQpKL4J:https://miratrix.co.uk/tag/bridgnorth-shoplifters/ 0 days 00:00:00 https://39/blog/page/ 0 days 00:00:00 Name: Avg. Time on Page, dtype: timedelta64[ns]
#Check your aggregated groupby
ga_bou_ex_type.info()
<class 'pandas.core.frame.DataFrame'> Index: 316 entries, https://#utm_source=nav&utm_medium=traffic&utm_campaign=datatools/ to https://xFRAx2jfTOmg+6EPY+09e3SpRN05tZm6PA=&b=5&f=frame&u=quh9O6fk3cp+g1Q0B84ejuJAQEZJll4HdKCQfOVJedw=&b=5/go.php?u=quh9O7m+gcpnkEcjR8wG07lIAEwDjhYPfuuLYf1GaJpICjY4ebQy5itrwJGI8mdy/xFTBhuneDOmg+6EPY+09e3SpRN05tZo5/w=&b=5&f=frame&u=quh9O7m+gcpnkEcjR8wG07lIAEwDjhYPfuuLYf1GaJpICjY4ebQy5itrwJGI8mdy/ Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 (Bounce Rate, mean) 316 non-null float64 1 (% Exit, mean) 316 non-null float64 2 (Pageviews, sum) 316 non-null int64 3 (Unique Pageviews, sum) 316 non-null int64 dtypes: float64(2), int64(2) memory usage: 12.3+ KB
Here we will rebuild the Google Analytics dataset with the fixed data for use later.
#Merge ga_time and ga_bou_ex_type on Page with an inner join
new_ga = pd.merge(ga_time, #left dataset
ga_bou_ex_type, #right dataset
how='inner', #keep only rows present in both datasets
left_on="Page", #the left dataset is joined on the column Page
right_on="Page" #the right dataset is joined on the column Page
).sort_values(by="Avg. Time on Page", ascending=False) #sort the new dataframe
C:\Users\jki\AppData\Local\Temp\ipykernel_1124\1241091563.py:2: FutureWarning: merging between different levels is deprecated and will be removed in a future version. (1 levels on the left, 2 on the right) new_ga = pd.merge(ga_time, #left dataset
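The FutureWarning appears because ga_bou_ex_type carries two column levels from the .agg() call while ga_time has one. A hedged sketch of how the warning could be avoided by flattening the column levels before merging (not run here, since the rename cell below relies on the tuple column names):
#Join each (column, aggregation) pair into a single flat string, e.g. 'Bounce Rate_mean'
ga_bou_ex_type.columns = ['_'.join(col) for col in ga_bou_ex_type.columns]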
new_ga.head()
Avg. Time on Page | (Bounce Rate, mean) | (% Exit, mean) | (Pageviews, sum) | (Unique Pageviews, sum) | |
---|---|---|---|---|---|
Page | |||||
https://channels/channel/UCAQfRNzXGD4BQICkO1KQZUA/ | 0 days 00:21:45 | 0.0 | 0.0 | 1 | 1 |
https://miratrix.co.uk/top-3-competitor-analysis-for-aso/ | 0 days 00:18:09.500000 | 50.0 | 87.5 | 5 | 5 |
https://www.miratrix.co.uk?SuperSocializerAuth=LiveJournal/blog/page/30/ | 0 days 00:14:57 | 0.0 | 0.0 | 1 | 1 |
https://about/channel/UCAQfRNzXGD4BQICkO1KQZUA/ | 0 days 00:14:45 | 0.0 | 0.0 | 2 | 2 |
https://videos?view=0&sort=p&flow=grid/channel/UCAQfRNzXGD4BQICkO1KQZUA/ | 0 days 00:14:43 | 0.0 | 0.0 | 1 | 1 |
#rename (Bounce Rate, mean), (% Exit, mean), (Pageviews, sum), (Unique Pageviews, sum)
new_ga = new_ga.rename(columns={('Bounce Rate', 'mean'): 'Bounce Rate', ('% Exit', 'mean') : 'Exit',
('Pageviews', 'sum') : 'Pageviews',
('Unique Pageviews', 'sum') : 'Unique Pageviews'})
#Check the rename worked
new_ga.head()
Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | |
---|---|---|---|---|---|
Page | |||||
https://channels/channel/UCAQfRNzXGD4BQICkO1KQZUA/ | 0 days 00:21:45 | 0.0 | 0.0 | 1 | 1 |
https://miratrix.co.uk/top-3-competitor-analysis-for-aso/ | 0 days 00:18:09.500000 | 50.0 | 87.5 | 5 | 5 |
https://www.miratrix.co.uk?SuperSocializerAuth=LiveJournal/blog/page/30/ | 0 days 00:14:57 | 0.0 | 0.0 | 1 | 1 |
https://about/channel/UCAQfRNzXGD4BQICkO1KQZUA/ | 0 days 00:14:45 | 0.0 | 0.0 | 2 | 2 |
https://videos?view=0&sort=p&flow=grid/channel/UCAQfRNzXGD4BQICkO1KQZUA/ | 0 days 00:14:43 | 0.0 | 0.0 | 1 | 1 |
Sometimes we end up with data that we don't need and want to drop out of our dataframe. In this tutorial you'll learn how to drop columns from Google Analytics and Facebook Ads data.
#Check the Google Analytics Page data
ga_page_traffic_data.head()
Day Index | Pageviews | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | |
---|---|---|---|---|---|---|---|---|
0 | 2018-08-31 | 14 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2018-09-01 | 6 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2018-09-02 | 17 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2018-09-03 | 14 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2018-09-04 | 11 | NaN | NaN | NaN | NaN | NaN | NaN |
#Drop any columns that contain NaN values
ga_page_traffic_data.dropna(axis=1, inplace=True)
#Check the dropna worked
ga_page_traffic_data.head()
Day Index | Pageviews | |
---|---|---|
0 | 2018-08-31 | 14 |
1 | 2018-09-01 | 6 |
2 | 2018-09-02 | 17 |
3 | 2018-09-03 | 14 |
4 | 2018-09-04 | 11 |
#Check the Facebook Ads data
fb_data.head()
Unnamed: 0 | Reporting starts | Reporting ends | Campaign name | Delivery | Mobile app installs | Mobile app actions | Cost per mobile app install (GBP) | Cost per mobile app action (GBP) | Desktop app installs | Desktop app engagement | Cost per desktop app install (GBP) | Cost per desktop app engagement (GBP) | Post reactions | Post comments | Post shares | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2014-02-21 | 2018-01-29 | Campaign1 | 0 | 11451.0 | NaN | 1.275683 | NaN | NaN | NaN | NaN | NaN | 1807.0 | 42.0 | 133.0 |
1 | 1 | 2015-12-04 | 2015-12-04 | Campaign2 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
2 | 2 | 2015-10-21 | 2015-10-21 | Campaign3 | inactive | 2.0 | NaN | 3.530000 | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
3 | 3 | 2015-10-20 | 2015-10-20 | Campaign3 | inactive | 2.0 | NaN | 16.500000 | NaN | NaN | NaN | NaN | NaN | 15.0 | NaN | NaN |
4 | 4 | 2015-10-20 | 2015-10-20 | Campaign4 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
#Drop the unneeded 'Unnamed: 0' index column
fb_data.drop('Unnamed: 0', axis=1, inplace=True)
#Check the drop worked
fb_data.head()
Reporting starts | Reporting ends | Campaign name | Delivery | Mobile app installs | Mobile app actions | Cost per mobile app install (GBP) | Cost per mobile app action (GBP) | Desktop app installs | Desktop app engagement | Cost per desktop app install (GBP) | Cost per desktop app engagement (GBP) | Post reactions | Post comments | Post shares | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014-02-21 | 2018-01-29 | Campaign1 | 0 | 11451.0 | NaN | 1.275683 | NaN | NaN | NaN | NaN | NaN | 1807.0 | 42.0 | 133.0 |
1 | 2015-12-04 | 2015-12-04 | Campaign2 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
2 | 2015-10-21 | 2015-10-21 | Campaign3 | inactive | 2.0 | NaN | 3.530000 | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN |
3 | 2015-10-20 | 2015-10-20 | Campaign3 | inactive | 2.0 | NaN | 16.500000 | NaN | NaN | NaN | NaN | NaN | 15.0 | NaN | NaN |
4 | 2015-10-20 | 2015-10-20 | Campaign4 | inactive | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
We can see from the Facebook Ads data that there are a lot of NaN (Not a Number) values; these represent missing data. In order to visualise this data we will need to replace them with a number. In this case we will use 0.
fb_data.fillna(0,inplace=True)
#Check that fillna worked
fb_data.head()
Reporting starts | Reporting ends | Campaign name | Delivery | Mobile app installs | Mobile app actions | Cost per mobile app install (GBP) | Cost per mobile app action (GBP) | Desktop app installs | Desktop app engagement | Cost per desktop app install (GBP) | Cost per desktop app engagement (GBP) | Post reactions | Post comments | Post shares | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014-02-21 | 2018-01-29 | Campaign1 | 0 | 11451.0 | 0.0 | 1.275683 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1807.0 | 42.0 | 133.0 |
1 | 2015-12-04 | 2015-12-04 | Campaign2 | inactive | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
2 | 2015-10-21 | 2015-10-21 | Campaign3 | inactive | 2.0 | 0.0 | 3.530000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
3 | 2015-10-20 | 2015-10-20 | Campaign3 | inactive | 2.0 | 0.0 | 16.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 15.0 | 0.0 | 0.0 |
4 | 2015-10-20 | 2015-10-20 | Campaign4 | inactive | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
In order to extract more value from your datasets you can join them together so that you can view the trends all in one place. Pandas has various methods for joining data; in this case you will use pd.merge.
#Use pandas merge to join Google Search Console and Google Analytics data
ga_gsc_data = pd.merge(gsc_page, #left data
new_ga, #right data
how='inner',
left_on="Page",
right_index=True)
#Check it worked
ga_gsc_data.head()
Page | Clicks | Impressions | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | |
---|---|---|---|---|---|---|---|---|---|---|
0 | https://miratrix.co.uk/ | 371 | 100673 | 0.37% | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 |
1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.26% | 26.84 | 0 days 00:01:45.333333333 | 64.436667 | 60.913333 | 210 | 181 |
2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 1.1% | 30.64 | 0 days 00:04:06 | 53.335000 | 89.475000 | 30 | 26 |
3 | https://miratrix.co.uk/blippar-the-message-isn... | 11 | 517 | 2.13% | 36.05 | 0 days 00:00:22.333333333 | 58.170000 | 78.333333 | 27 | 21 |
4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 7.14% | 5.61 | 0 days 00:02:52 | 22.223333 | 38.890000 | 15 | 13 |
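As an aside, because new_ga is indexed by Page, a minimal sketch of the same join using DataFrame.join (assuming the frames built above) would be:
#Set Page as the index on the GSC data, then join on the shared index
ga_gsc_alt = gsc_page.set_index('Page').join(new_ga, how='inner')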
Saving data in pandas is super easy. We'll be using .to_csv() to save the augmented and new datasets for use later.
#Save the newly created Page dataset
ga_gsc_data.to_csv('ga_gsc_data.csv')
#Save the amended facebook dataset
fb_data.to_csv('fixed_fb_data.csv')
Visualisation is a huge part of what I do as a marketer. Visualising data helps me spot trends, problems and opportunities. Having strong visualisation skills will also help you to communicate your data-driven ideas and plans to your boss/team/clients/stakeholders.
By the end of this section I will have shown you how to draw, augment and stylise your visualisations using Pandas, Matplotlib and Seaborn.
import pandas as pd # importing pandas library
import matplotlib.pyplot as plt #importing matplotlib library
import seaborn as sns #importing seaborn package
%matplotlib inline
import warnings #for managing error message
warnings.filterwarnings('ignore')
ga_gsc_data = pd.read_csv('ga_gsc_data.csv',
usecols=['Page', 'Clicks', 'Impressions', 'CTR', 'Position',
'Avg. Time on Page', 'Bounce Rate', 'Exit', 'Pageviews',
'Unique Pageviews'])
#View the make up of the data using .info()
ga_gsc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 193 entries, 0 to 192 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Page 193 non-null object 1 Clicks 193 non-null int64 2 Impressions 193 non-null int64 3 CTR 193 non-null object 4 Position 193 non-null float64 5 Avg. Time on Page 193 non-null object 6 Bounce Rate 193 non-null float64 7 Exit 193 non-null float64 8 Pageviews 193 non-null int64 9 Unique Pageviews 193 non-null int64 dtypes: float64(3), int64(4), object(3) memory usage: 15.2+ KB
#Have a look at the data
ga_gsc_data.head()
Page | Clicks | Impressions | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | |
---|---|---|---|---|---|---|---|---|---|---|
0 | https://miratrix.co.uk/ | 371 | 100673 | 0.37% | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 |
1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.26% | 26.84 | 0 days 00:01:45.333333333 | 64.436667 | 60.913333 | 210 | 181 |
2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 1.1% | 30.64 | 0 days 00:04:06 | 53.335000 | 89.475000 | 30 | 26 |
3 | https://miratrix.co.uk/blippar-the-message-isn... | 11 | 517 | 2.13% | 36.05 | 0 days 00:00:22.333333333 | 58.170000 | 78.333333 | 27 | 21 |
4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 7.14% | 5.61 | 0 days 00:02:52 | 22.223333 | 38.890000 | 15 | 13 |
You can see from the info table above that we have some Series which should be numerical or time-based but are instead objects. Before visualising you must transform this data into the correct data types.
#Remove the % sign using string replace and convert the column to a float
ga_gsc_data.CTR = ga_gsc_data.CTR.str.replace('%','').astype('float')
#Replace 0 days with nothing and convert the remaining string to a timedelta
ga_gsc_data['Avg. Time on Page'] = ga_gsc_data['Avg. Time on Page'].str.replace('0 days ','').apply(pd.Timedelta)
#Set Page as index
ga_gsc_data.set_index('Page', inplace=True)
#Plot a basic bar chart showing the statistical breakdown of the dataset
ga_gsc_data.describe().plot.bar(figsize=(15,5), secondary_y='Impressions')
<Axes: >
ga_gsc_data[['Clicks', 'Pageviews', 'Position']][:20].plot.bar(figsize=(15,5),
secondary_y=('Position'))
<Axes: xlabel='Page'>
Matplotlib is the underlying plotting library for Pandas. Many of the more recent tools created for visualising data are built on top of matplotlib, including Seaborn, a popular library that we'll investigate later in this notebook. In this section you'll use functions from matplotlib to customise charts for presentations.
## Creating matplotlib subplots
Subplots are a common method for drawing two plots either side by side or as an overlay. You will see this concept depicted in many Python plotting tutorials and it's good to understand how the functions work.
#Create subplots of two Series of data from ga_gsc_data
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(121) # Create matplotlib axes (nrows, ncolumns, position)
ax2 = fig.add_subplot(122) # Create another axes
#first plot
ga_gsc_data['Clicks'][:10].plot.bar(ax=ax #set the matplotlib axes to ax
)
#second plot
ga_gsc_data['Pageviews'][:10].plot.bar(ax=ax2 #set the matplotlib axes to ax2
)
plt.show() #display the charts
On many occasions in marketing you will need to plot a secondary y axis. This is because many datasets have one column of data much larger than any other, which makes it very difficult to understand the relationships between data points as one dwarfs the rest. In this video you will learn how to plot a secondary y axis and overlay two subplots onto a single chart.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #the width of the bars
ga_gsc_data['Clicks'][:10].plot.bar(color='red', #colour of bar
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1) #the position of the bar relative to the others
ga_gsc_data['Pageviews'][:10].plot(kind='bar', #type of plot
color='blue', #colour of bar
ax=ax2, #which matplotlib axes to use
width=width, #set the bar width
position=0) #the position of the bar relative to the others
#display the charts
plt.show()
## Adding x and y labels to a plot
We may need to add labels to our x and y axes to ensure a report is clear and that those not familiar with the topic can follow what is going on. In this tutorial you'll learn how to do this.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #set the width of the bars
ga_gsc_data['Clicks'][:10].plot(kind='bar', #type of chart to use
color='yellow', #colour of bar
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1, #the position of the bar relative to the others
fontsize=10) #set font size
ga_gsc_data['Pageviews'][:10].plot(kind='bar',
color='lightgreen',
ax=ax2,
width=width,
position=0)
#Create ax customisations
ax.set_xlabel('Page') #name the x axis
ax.set_ylabel('Clicks') # name the primary y axis
#Create ax2 customisations
ax2.set_ylabel('Pageviews') #name the secondary y axis
#display the charts
plt.show()
Sometimes you may want to change the position of your ticks to make them more readable. This is particularly apparent in this example, where we're dealing with URL strings as the xtick labels. In this video you'll learn how to change the angle at which the labels are displayed.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #set the width of the bars
ga_gsc_data['Clicks'][:10].plot(kind='bar', #type of chart to use
color='yellow', #colour of bar
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1, #the position of the bar relative to the others
fontsize=10) #set font size
ga_gsc_data['Pageviews'][:10].plot(kind='bar',
color='lightgreen',
ax=ax2,
width=width,
position=0)
#Create ax customisations
ax.set_xlabel('Page') #name the x axis
ax.set_ylabel('Clicks') #name the primary y axis
ax.set_xticklabels(ax.get_xticklabels(), #get the strings with the label names
rotation=45) #rotate the xtick labels by 45 degrees
#Create ax2 customisations
ax2.set_ylabel('Pageviews') #name the secondary y axis
#display the charts
plt.show()
When creating plots using subplots you will need to explicitly set the legend for the figure. If you are plotting multiple data series and they are not labelled correctly, the person reading the plot won't know what you're referencing. This will make communicating your needs and plans difficult.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #set the width of the bars
ga_gsc_data['Clicks'][:10].plot(kind='bar', #type of chart to use
color='yellow', #colour of bar
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1, #the position of the bar relative to the others
fontsize=10) #set font size
ga_gsc_data['Pageviews'][:10].plot(kind='bar',
color='lightgreen',
ax=ax2,
width=width,
position=0)
#Create ax customisations
ax.set_xlabel('Page') #name the x axis
ax.set_ylabel('Clicks') #name the primary y axis
ax.set_xticklabels(ax.get_xticklabels(), #get the strings with the label names
rotation=45) #rotate the xtick labels by 45 degrees
#Create ax2 customisations
ax2.set_ylabel('Pageviews') #name the secondary y axis
#Global Plot settings
fig.legend() # set the legend
#display the charts
plt.show()
Properly labelling your chart so that a viewer can immediately understand what's being communicated is important for you and your organisation. You can easily set the chart name by plotting a title.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #set the width of the bars
ga_gsc_data['Clicks'][:10].plot(kind='bar', #type of chart to use
color='yellow', #colour of bar
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1, #the position of the bar relative to the others
fontsize=10) #set font size
ga_gsc_data['Pageviews'][:10].plot(kind='bar',
color='lightgreen',
ax=ax2,
width=width,
position=0)
#Create ax2 customisations
ax2.set_ylabel('Pageviews') #name the secondary y axis
#Global Plot settings
plt.title('Bar Chart: Clicks vs Pageview') # set chart name
fig.legend() # set the legend
#display the charts
plt.show()
This is a great way of communicating to others clearly that "PEOPLE, THE PROBLEM IS RIGHT HERE!". Using plot annotations allows you to be very specific about which parts of a chart you want viewers to pay attention to, and it's pretty easy to do.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #set the width of the bars
ga_gsc_data['Clicks'][:10].plot(kind='bar', #type of chart to use
color='yellow', #colour of bar
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1, #the position of the bar relative to the others
fontsize=10) #set font size
ga_gsc_data['Pageviews'][:10].plot(kind='bar',
color='lightgreen',
ax=ax2,
width=width,
position=0)
#Create ax customisations
ax.annotate('Lots of Pageviews but few clicks',
xy=(8, 50),
xytext=(2, 120),
arrowprops=dict(facecolor='cyan', #arrow face colour
shrink=0.05, #shrink the arrow from both ends
lw=1, #line width
ec='magenta', #border colour
zorder=1) #layering order of annotation
)
ax.set_xlabel('Page') #name the x axis
ax.set_ylabel('Clicks') # name the primary y axis
ax.set_xticklabels(ax.get_xticklabels(), #get the strings with the label names
rotation=45) #rotate the xtick labels by 45 degrees
#Create ax2 customisations
ax2.annotate('Lots of Pageviews and clicks',
xy=(0, 1398),
xytext=(1, 1398),
arrowprops=dict(facecolor='cyan', #arrow face colour
shrink=0.05, #shrink the arrow from both ends
lw=1, #line width
ec='magenta', #border colour
zorder=1) #layering order of annotation
)
ax2.set_ylabel('Pageviews') #name the secondary y axis
#Global Plot settings
plt.title('Bar Chart: Clicks vs Pageview') # set chart name
fig.legend() # set the legend
#display the charts
plt.show()
If you're like me and you don't want to think about colour schemes too much then matplotlib has a solution for you. They've created a bunch of prebaked chart style sheets which you can switch between. This takes the thought out of styling so that you can focus on the key points you want to communicate.
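To list the style-sheet names your matplotlib install accepts, a quick sketch (note that newer matplotlib releases renamed the seaborn styles, e.g. 'seaborn-pastel' became 'seaborn-v0_8-pastel'):
import matplotlib.pyplot as plt

print(plt.style.available) #every name accepted by plt.style.use()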
plt.style.use('seaborn-pastel') #set a style sheet
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.2 #set the width of the bars
ga_gsc_data['Clicks'][:10].plot(kind='bar', #type of chart to use
color='yellow',
ax=ax, #which matplotlib axes to use
width=width, #set the bar width
position=1, #the position of the bar relative to the others
fontsize=10) #set font size
ga_gsc_data['Pageviews'][:10].plot(kind='bar',
color='lightgreen',
ax=ax2,
width=width,
position=0)
#Create ax customisations
ax.annotate('Lots of Pageviews but few clicks',
xy=(8, 50),
xytext=(2, 120),
arrowprops=dict(shrink=0.05, #shrink the arrow from both ends
lw=1, #line width
zorder=1) #layering order of annotation
)
ax.set_xlabel('Page') #name the x axis
ax.set_ylabel('Clicks') # name the primary y axis
ax.set_xticklabels(ax.get_xticklabels(), #get the strings with the label names
rotation=45) #rotate the xtick labels by 45 degrees
#Create ax2 customisations
ax2.annotate('Lots of Pageviews and clicks',
xy=(0, 1398),
xytext=(1, 1398),
arrowprops=dict(shrink=0.05, #shrink the arrow from both ends
lw=1, #line width
zorder=1) #layering order of annotation
)
ax2.set_ylabel('Pageviews') #name the secondary y axis
#Global Plot settings
plt.title('Bar Chart: Clicks vs Pageview') # set chart name
fig.legend() # set the legend
#display the charts
plt.show()
plt.style.use('default') # restores the notebook back to default palette
Seaborn is a plotting package built on matplotlib that allows for very pretty plots with very little code. The syntax is different from matplotlib and pandas' plot, but you'll soon get a handle on it. We imported seaborn as sns earlier in the notebook.
In this section we're going to plot a scatter graph in Seaborn and customise it with colours and sizes to make the data more readable.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
sns.scatterplot(x='Impressions', # x axis
y="Clicks", # y axis
data=ga_gsc_data) #the dataset we're using
#display the charts
plt.show()
For this plot we will add the hue argument. This will allow us to apply a different hue to the dots based on a numerical value in the column we selected.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
sns.scatterplot(x='Impressions', # x axis
y="Clicks", # y axis
hue='Position', #colouring the dots based on a numerical value
data=ga_gsc_data) #the dataset we're using
#display the charts
plt.show()
This time we will add the size argument. This will allow us to apply a different size to the dots based on a numerical value in the column we selected.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
sns.scatterplot(x='Impressions', # x axis
y="Clicks", # y axis
hue='Position', #colouring the dots based on a numerical value
size="Pageviews", #sizing the dots based on a numerical value
data=ga_gsc_data) #the dataset we're using
#display the charts
plt.show()
Finally we will pass a colour palette to the scatter chart. This will change the colours of the plotted dots. In this case we are going to pass the colour palette "Set2".
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
sns.scatterplot(x='Impressions', # x axis
y="Clicks", # y axis
hue='Position', #colouring the dots based on a numerical value
size="Pageviews", #sizing the dots based on a numerical value
palette="Set2", #selecting a colour palette
data=ga_gsc_data) #the dataset we're using
#display the charts
plt.show()
It's difficult to argue that heatmaps aren't impressive-looking visualisations. They're also incredibly useful when you feed correlation data into them: a heatmap of correlation data can help you find related features in a dataset that you should focus on. Pandas together with Seaborn makes it very easy to create correlations that you can plot in a heatmap and get answers fast.
#Load the facebook data fb_example_data.csv
fb_camp_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\fb_example_data.csv")
fb_camp_data.head()
Results | Reach | Frequency | Cost per results | Budget | Amount spent (GBP) | Clicks (all) | CTR (all) | CPC (all) (GBP) | Impressions | CPM (cost per 1,000 impressions) (GBP) | Link clicks | CPC (cost per link click) (GBP) | CTR (link click-through rate) | Website purchases | Website purchases conversion value | Website purchase ROAS (return on advertising spend) | Button clicks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 105 | 77905 | 1.182132 | 2.871238 | 187.2 | 301.48 | 2570 | 2.790627 | 0.117307 | 92094 | 3.273612 | 2126 | 0.141806 | 2.308511 | 105 | 10927.20 | 36.245190 | 482 |
1 | 77 | 59035 | 1.204590 | 3.910130 | 187.2 | 301.08 | 1846 | 2.595869 | 0.163099 | 71113 | 4.233825 | 1580 | 0.190557 | 2.221816 | 77 | 8525.30 | 28.315730 | 319 |
2 | 49 | 48718 | 1.254116 | 6.135102 | 187.2 | 300.62 | 1485 | 2.430521 | 0.202438 | 61098 | 4.920292 | 1223 | 0.245805 | 2.001702 | 49 | 5955.55 | 19.810891 | 261 |
3 | 66 | 66413 | 1.259859 | 3.957727 | 187.2 | 261.21 | 1703 | 2.035353 | 0.153382 | 83671 | 3.121870 | 1342 | 0.194642 | 1.603901 | 66 | 9079.10 | 34.757858 | 342 |
4 | 47 | 64472 | 1.320946 | 5.499787 | 187.2 | 258.49 | 1373 | 1.612184 | 0.188267 | 85164 | 3.035203 | 1099 | 0.235205 | 1.290451 | 47 | 4602.00 | 17.803397 | 257 |
#inspect the data
fb_camp_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19 entries, 0 to 18 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Results 19 non-null int64 1 Reach 19 non-null int64 2 Frequency 19 non-null float64 3 Cost per results 19 non-null float64 4 Budget 19 non-null float64 5 Amount spent (GBP) 19 non-null float64 6 Clicks (all) 19 non-null int64 7 CTR (all) 19 non-null float64 8 CPC (all) (GBP) 19 non-null float64 9 Impressions 19 non-null int64 10 CPM (cost per 1,000 impressions) (GBP) 19 non-null float64 11 Link clicks 19 non-null int64 12 CPC (cost per link click) (GBP) 19 non-null float64 13 CTR (link click-through rate) 19 non-null float64 14 Website purchases 19 non-null int64 15 Website purchases conversion value 19 non-null float64 16 Website purchase ROAS (return on advertising spend) 19 non-null float64 17 Button clicks 19 non-null int64 dtypes: float64(11), int64(7) memory usage: 2.8 KB
#Create a correlation matrix and store as fb_corr
fb_corr = fb_camp_data.corr()
#Use seaborn to draw a heatmap of the correlated features in our dataset
plt.figure(figsize=(20,16))
sns.heatmap(fb_corr, #load the correlation matrix
linewidth=0.5, #line width between cells
cmap='RdBu_r', #preferred colour map
vmin=-1, #minimum of the colour scale
vmax=1, #maximum of the colour scale
annot=True) #annotate each cell with its value
#display the charts
plt.show()
Timeseries data is your bread and butter. Timeseries data is anything which is plotted over time... which is basically everything! Pandas has some great tools for managing timeseries data easily so that you can get the knowledge you need from your data to build strategy effectively and quickly.
N.B. When sourcing timeseries data you will need to determine what time increments are best for you. Pandas can help you deal with data at a microsecond level, but it's not always useful to sample data in one-second increments when your business sells one high-ticket item per week. Determine the smallest time increment that makes sense for your business and start from there.
import pandas as pd # importing pandas library
import matplotlib.pyplot as plt #importing matplotlib library
%matplotlib inline
ga_page_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_pages.csv",
skiprows=386,
nrows=366,
usecols=['Day Index', 'Pageviews'],
parse_dates=['Day Index'],
index_col='Day Index')
ppc_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\ppc_data.csv",
parse_dates=['Day'],
index_col='Day')
Start every new file by exploring what the data is. Don't presume that your data is correct, as this could cost you time and stress wondering why a plot or calculation isn't working.
ga_page_traffic_data.head()
Pageviews | |
---|---|
Day Index | |
2018-08-31 | 14 |
2018-09-01 | 6 |
2018-09-02 | 17 |
2018-09-03 | 14 |
2018-09-04 | 11 |
ga_page_traffic_data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 366 entries, 2018-08-31 to 2019-08-31 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pageviews 366 non-null int64 dtypes: int64(1) memory usage: 5.7 KB
ga_page_traffic_data.plot(figsize=(15,5))
<Axes: xlabel='Day Index'>
It can be seen in the previous visualisation that we have some missing values. This could cause some skew in the data when we run calculations on it. To fix this you need to take the average over the year and fill the gaps with that value.
ga_page_traffic_data.describe()
Pageviews | |
---|---|
count | 366.000000 |
mean | 9.065574 |
std | 16.412694 |
min | 0.000000 |
25% | 3.000000 |
50% | 6.000000 |
75% | 12.000000 |
max | 274.000000 |
#Create a variable for Pageview average
average_value = ga_page_traffic_data['Pageviews'].mean()
#Print the average
average_value
9.065573770491802
#Replace any values that are 0 with the average values
ga_page_traffic_data['Pageviews'].replace(0,
average_value,
inplace=True)
#Plot chart to check missing values have been replaced
ga_page_traffic_data.plot(figsize=(15,5))
<Axes: xlabel='Day Index'>
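As an aside, replacing zeros with the annual mean is quick but flattens any seasonality. A minimal alternative sketch, applied to the raw series before the mean-fill above, is to treat zeros as missing and interpolate between neighbouring days:
import numpy as np
#Mark zeros as missing, then fill them by time-weighted interpolation
pv = ga_page_traffic_data['Pageviews'].replace(0, np.nan)
pv_filled = pv.interpolate(method='time') #uses the DatetimeIndex for spacing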
You'll always want to look at timeseries data in at least daily, weekly, monthly, quarterly and yearly increments. Usually you are looking for trends or issues in the data. Pandas has a method called resample which buckets dates into specified time increments and aggregates the numerical data with a chosen calculation, e.g. the average.
#Let's look at the data by Month using the resample method
ga_page_monthly = ga_page_traffic_data.resample('M').mean()
#Plot a monthly line graph
ga_page_monthly.plot(figsize=(15,5))
<Axes: xlabel='Day Index'>
#Resample the data by quarter and take the average
ga_page_quarterly = ga_page_traffic_data.resample('Q').mean()
#Plot a quarterly line graph
ga_page_quarterly.plot(figsize=(15,5))
<Axes: xlabel='Day Index'>
There are two ways to view an average over time. The first is resampling, which takes the average over a set range of 7 days. The other is a rolling average, which calculates the average in a rolling 7-day window. The primary difference is that resampled windows do not overlap while rolling windows do, so the rolling average gives a more granular account of the average for a period than resampling does. Both are useful; it's situation dependent.
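A toy series makes the distinction concrete (illustrative data, not from the campaign):
import pandas as pd
#14 days of toy data: resample gives one value per non-overlapping 7-day
#block, rolling gives one value per day from a sliding 7-day window
s = pd.Series(range(14), index=pd.date_range('2019-01-01', periods=14, freq='D'))
print(s.resample('7D').mean())    #2 rows: days 1-7, days 8-14
print(s.rolling(window=7).mean()) #14 rows: NaN until the window fills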
In this tutorial we'll be using the weekly average to investigate the rolling cost of our PPC campaign. This is an important metric to investigate, as being able to optimise spend based on historical trends will improve the campaign's return on investment. There are two methods for viewing the weekly average; I'll cover them both.
ppc_data.head()
Unnamed: 0 | Ad group | Avg. CPC | CTR | Campaign | Clicks | Conv. rate | Conversions | Cost | Cost / conv. | Final URL | Impr. | Keyword | Keyword status | Max. CPC | Mobile final URL | Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Day | |||||||||||||||||
2018-08-31 | 0 | miratrix_ppc_adgroup | 2.490563 | 16.894144 | miratrix_ppc | 75 | 0.225225 | 1 | 1105.809783 | 1105.809783 | Text | 7501 | keyword | Active | 2.6 | Text | Active |
2018-09-01 | 1 | miratrix_ppc_adgroup | 2.472161 | 5.976134 | miratrix_ppc | 17 | 1.193317 | 5 | 1035.835572 | 207.167114 | Text | 2504 | keyword | Active | 2.6 | Text | Active |
2018-09-02 | 2 | miratrix_ppc_adgroup | 2.508561 | 6.824919 | miratrix_ppc | 3 | 0.107411 | 1 | 2335.470042 | 2335.470042 | Text | 6354 | keyword | Active | 2.6 | Text | Active |
2018-09-03 | 3 | miratrix_ppc_adgroup | 2.548919 | 8.058366 | miratrix_ppc | 99 | 2.723735 | 7 | 655.072102 | 93.581729 | Text | 2071 | keyword | Active | 2.6 | Text | Active |
2018-09-04 | 4 | miratrix_ppc_adgroup | 2.414868 | 6.884937 | miratrix_ppc | 93 | 1.464435 | 7 | 1154.306794 | 164.900971 | Text | 3291 | keyword | Active | 2.6 | Text | Active |
#Let's look at the cost data from a ppc dataset
ppc_data['Cost'].plot(figsize=(15,5))
<Axes: xlabel='Day'>
#Let's resample that data by week and take the average
ppc_weekly = ppc_data['Cost'].resample('W').mean()
ppc_weekly.head()
Day
2018-09-02    1492.371799
2018-09-09    1073.762622
2018-09-16    1554.423832
2018-09-23    1322.445411
2018-09-30    1041.005947
Freq: W-SUN, Name: Cost, dtype: float64
ppc_weekly.plot(figsize=(15,5))
<Axes: xlabel='Day'>
ppc_rolling_7d = ppc_data['Cost'].rolling(window=7, #set a rolling window of x days
center=True # label each window at its midpoint
).mean()
ppc_rolling_7d.head(10)
Day
2018-08-31            NaN
2018-09-01            NaN
2018-09-02            NaN
2018-09-03    1248.123595
2018-09-04    1319.591703
2018-09-05    1310.633560
2018-09-06    1073.762622
2018-09-07    1320.345163
2018-09-08    1317.992852
2018-09-09    1089.016399
Name: Cost, dtype: float64
ppc_rolling_7d.plot(figsize=(15,5))
<Axes: xlabel='Day'>
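Note the NaNs at the start of the rolling series: the centred 7-day window doesn't fully fit at the edges of the data. If you'd rather keep estimates for those dates, a minimal sketch using the rolling method's min_periods argument:
#min_periods lets the window shrink at the boundaries, so the first and
#last dates still get an estimate from however many days are available
ppc_rolling_7d_full = ppc_data['Cost'].rolling(window=7,
                                               center=True,
                                               min_periods=1).mean()
ppc_rolling_7d_full.head(3) #no leading NaNs this time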
Here we will plot the CPC data three ways: daily, resampled weekly and 7-day rolling, then overlay them to compare the output.
fig = plt.figure(figsize=(15,5)) # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
figsize = (15,8)
ppc_data['Cost'].plot(figsize=figsize,
alpha=.5,
marker='.',
linestyle='-',
linewidth=0.5,
label='Daily'
)
ppc_weekly.plot(figsize=figsize,
marker='x',
markersize=8,
linestyle='-',
label='Weekly Mean Resample'
)
ppc_rolling_7d.plot(figsize=figsize,
marker='o',
linestyle='-',
label='7-d Rolling Mean'
)
ax.set_xlabel('Date') #name the x axis
ax.set_ylabel('Cost Per Click (CPC)') # name the primary y axis
#Global Plot settings
plt.title('COMPARE: Daily, Weekly Mean, 7-d Rolling Mean ') # set chart name
fig.legend() # set the legend
#display the charts
plt.show()
You want to make as much reusable code as possible and you want very clear reports that anyone can read. In marketing we regularly want to find the highest and lowest CPC, so we'll create an annotation that does this dynamically. Whenever we run this code on a new ppc dataset it will automatically highlight the max and min CPC.
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(111)
figsize = (15,8)
ppc_data['Cost'].plot(figsize=figsize,
alpha=.5,
marker='.',
linestyle='-',
linewidth=0.5,
label='Daily'
)
ppc_weekly.plot(figsize=figsize,
marker='x',
markersize=8,
linestyle='-',
label='Weekly Mean Resample'
)
ppc_rolling_7d.plot(figsize=figsize,
marker='o',
linestyle='-',
label='7-d Rolling Mean'
)
max_value = ppc_rolling_7d.max()
max_value_index = ppc_rolling_7d.idxmax()
min_value = ppc_rolling_7d.min()
min_value_index = ppc_rolling_7d.idxmin()
#Create dynamic annotations for max values
ax.annotate('Highest CPC: £{}'.format(round(max_value)), #built-in round() works on plain Python floats
            xy=(max_value_index,max_value),
            xytext=(max_value_index,max_value),
            arrowprops=dict(facecolor='cyan', #colour
                            shrink=0.05, #length of arrow
                            lw=1, #line width
                            ec='magenta', #border colour
                            zorder=1)) #layering order of annotation
#Create dynamic annotations for min values
ax.annotate('Lowest CPC: £{}'.format(round(min_value)), #round() again, for the same reason
            xy=(min_value_index,min_value),
            xytext=(min_value_index,min_value),
            arrowprops=dict(facecolor='cyan', #colour
                            shrink=0.0, #length of arrow
                            lw=1, #line width
                            ec='magenta', #border colour
                            zorder=1)) #layering order of annotation
#Global Plot settings
plt.title('COMPARE: Daily, Weekly Mean, 7-d Rolling Mean ') # set chart name
fig.legend() # set the legend
#display the charts
plt.show()
Calculations and creating new metrics in Python is very simple, and it can open up a whole world of benchmarks and metrics with which to govern your marketing activities. In this notebook we'll look at various ways of creating and calculating metrics.
import pandas as pd # importing pandas library
ga_gsc_data = pd.read_csv(r"C:\Users\jki\Downloads\other_data\ga_gsc_data.csv")
ga_gsc_data.head()
Unnamed: 0 | Page | Clicks | Impressions | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | https://miratrix.co.uk/ | 371 | 100673 | 0.37% | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 |
1 | 1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.26% | 26.84 | 0 days 00:01:45.333333333 | 64.436667 | 60.913333 | 210 | 181 |
2 | 2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 1.1% | 30.64 | 0 days 00:04:06.000000000 | 53.335000 | 89.475000 | 30 | 26 |
3 | 3 | https://miratrix.co.uk/blippar-the-message-isn... | 11 | 517 | 2.13% | 36.05 | 0 days 00:00:22.333333333 | 58.170000 | 78.333333 | 27 | 21 |
4 | 4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 7.14% | 5.61 | 0 days 00:02:52.000000000 | 22.223333 | 38.890000 | 15 | 13 |
ga_gsc_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Unnamed: 0         193 non-null    int64
 1   Page               193 non-null    object
 2   Clicks             193 non-null    int64
 3   Impressions        193 non-null    int64
 4   CTR                193 non-null    object
 5   Position           193 non-null    float64
 6   Avg. Time on Page  193 non-null    object
 7   Bounce Rate        193 non-null    float64
 8   Exit               193 non-null    float64
 9   Pageviews          193 non-null    int64
 10  Unique Pageviews   193 non-null    int64
dtypes: float64(3), int64(5), object(3)
memory usage: 16.7+ KB
In this section you'll recalculate the CTR field, replacing the current object column with floats. You will also create two new metrics from the data you already have.
#Calculating CTR and replace the CTR column
ga_gsc_data.CTR = (ga_gsc_data.Clicks.div(ga_gsc_data.Impressions)*100).round(2)
ga_gsc_data.CTR.head()
0    0.37
1    0.26
2    1.10
3    2.13
4    7.14
Name: CTR, dtype: float64
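For completeness, a minimal alternative sketch: on a freshly loaded copy of the file, the '0.37%'-style strings can be converted directly rather than recomputed from Clicks and Impressions (the variable name raw is illustrative):
#Strip the trailing '%' from the original object column and cast to float
raw = pd.read_csv(r"C:\Users\jki\Downloads\other_data\ga_gsc_data.csv")
raw['CTR'] = raw['CTR'].str.rstrip('%').astype(float)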
From the exit percentage, let's calculate an integer value for the number of clicks that stayed on the site rather than bouncing straight back out. We can use this to create another CTR metric which will show us how many loyal visitors are clicking through.
#Calculate the clicks that stayed: Clicks minus the Exit % share of Clicks
ga_gsc_data['num_bounces'] = (ga_gsc_data['Clicks'] - (ga_gsc_data.Exit.div(100).multiply(
ga_gsc_data['Clicks']))).astype(int)
ga_gsc_data.head()
Unnamed: 0 | Page | Clicks | Impressions | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | num_bounces | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | https://miratrix.co.uk/ | 371 | 100673 | 0.37 | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 | 169 |
1 | 1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.26 | 26.84 | 0 days 00:01:45.333333333 | 64.436667 | 60.913333 | 210 | 181 | 31 |
2 | 2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 1.10 | 30.64 | 0 days 00:04:06.000000000 | 53.335000 | 89.475000 | 30 | 26 | 1 |
3 | 3 | https://miratrix.co.uk/blippar-the-message-isn... | 11 | 517 | 2.13 | 36.05 | 0 days 00:00:22.333333333 | 58.170000 | 78.333333 | 27 | 21 | 2 |
4 | 4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 7.14 | 5.61 | 0 days 00:02:52.000000000 | 22.223333 | 38.890000 | 15 | 13 | 6 |
ga_gsc_data.insert(4, #position
'CTR true visits', #name of column
(ga_gsc_data.num_bounces.div(ga_gsc_data.Impressions)*100).round(2) #value
)
ga_gsc_data.head()
Unnamed: 0 | Page | Clicks | Impressions | CTR true visits | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | num_bounces | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | https://miratrix.co.uk/ | 371 | 100673 | 0.17 | 0.37 | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 | 169 |
1 | 1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.10 | 0.26 | 26.84 | 0 days 00:01:45.333333333 | 64.436667 | 60.913333 | 210 | 181 | 31 |
2 | 2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 0.07 | 1.10 | 30.64 | 0 days 00:04:06.000000000 | 53.335000 | 89.475000 | 30 | 26 | 1 |
3 | 3 | https://miratrix.co.uk/blippar-the-message-isn... | 11 | 517 | 0.39 | 2.13 | 36.05 | 0 days 00:00:22.333333333 | 58.170000 | 78.333333 | 27 | 21 | 2 |
4 | 4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 4.29 | 7.14 | 5.61 | 0 days 00:02:52.000000000 | 22.223333 | 38.890000 | 15 | 13 | 6 |
We all need to filter data regularly; it's a daily task. Filtering data in pandas is simple as long as you remember the correct comparison operator to use, e.g. >= or <=.
#Filter the data by pages that have received 100 or more clicks
ga_gsc_data[ga_gsc_data['Clicks'] >= 100]
Unnamed: 0 | Page | Clicks | Impressions | CTR true visits | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | num_bounces | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | https://miratrix.co.uk/ | 371 | 100673 | 0.17 | 0.37 | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 | 169 |
#Filter by pages ranking in the top 10 that have received 100 or more impressions
filter1 = ga_gsc_data[(ga_gsc_data['Position'] <= 10) & (ga_gsc_data['Impressions'] >= 100)]
filter1
Unnamed: 0 | Page | Clicks | Impressions | CTR true visits | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | num_bounces | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 4 | https://miratrix.co.uk/author/nduddy/ | 10 | 140 | 4.29 | 7.14 | 5.61 | 0 days 00:02:52.000000000 | 22.223333 | 38.89 | 15 | 13 | 6 |
14 | 14 | https://miratrix.co.uk/miratrix-23-golden-squa... | 3 | 227 | 0.00 | 1.32 | 7.59 | 0 days 00:00:00.000000000 | 0.000000 | 100.00 | 4 | 4 | 0 |
34 | 36 | https://miratrix.co.uk/droidcon-not-just-for-d... | 1 | 166 | 0.00 | 0.60 | 8.00 | 0 days 00:00:00.000000000 | 0.000000 | 100.00 | 1 | 1 | 0 |
#Filter and sort your data
ga_gsc_data[ga_gsc_data['Impressions'] >= 1000].sort_values(by="CTR true visits",
ascending=False)
Unnamed: 0 | Page | Clicks | Impressions | CTR true visits | CTR | Position | Avg. Time on Page | Bounce Rate | Exit | Pageviews | Unique Pageviews | num_bounces | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | https://miratrix.co.uk/ | 371 | 100673 | 0.17 | 0.37 | 42.59 | 0 days 00:01:43.666666666 | 47.296667 | 54.286667 | 1396 | 1095 | 169 |
1 | 1 | https://miratrix.co.uk/app-marketing-agency/ | 80 | 30824 | 0.10 | 0.26 | 26.84 | 0 days 00:01:45.333333333 | 64.436667 | 60.913333 | 210 | 181 | 31 |
2 | 2 | https://miratrix.co.uk/mobile-app-competitor-a... | 16 | 1458 | 0.07 | 1.10 | 30.64 | 0 days 00:04:06.000000000 | 53.335000 | 89.475000 | 30 | 26 | 1 |
8 | 8 | https://miratrix.co.uk/app-store-optimization-... | 5 | 4177 | 0.02 | 0.12 | 70.02 | 0 days 00:01:31.666666666 | 61.906667 | 60.256667 | 198 | 160 | 1 |
82 | 93 | https://miratrix.co.uk/seo-agency-london/ | 0 | 1496 | 0.00 | 0.00 | 215.61 | 0 days 00:00:32.500000000 | 100.000000 | 66.665000 | 4 | 4 | 0 |
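As an aside, the same filters can be written with pandas' query method, which some people find easier to read once conditions stack up; a minimal sketch (column names containing spaces need backticks):
#Equivalent to filter1 above, written as a query string
filter1_q = ga_gsc_data.query('Position <= 10 and Impressions >= 100')
#Column names with spaces are wrapped in backticks
loyal = ga_gsc_data.query('Impressions >= 1000 and `CTR true visits` > 0')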
In this section we're going to build! You've got all your data cleaned and processed. Now it's time to start making it work for you. In this notebook we'll put together a very simple alerts function that you can customise all day long.
import pandas as pd
import matplotlib.pyplot as plt #importing matplotlib library
%matplotlib inline
ga_page_traffic_data = pd.read_csv(r"C:\Users\jki\Downloads\ga_data\ga_pages.csv",
skiprows=386, nrows=366,
usecols=['Day Index', 'Pageviews'],
parse_dates=['Day Index'],
index_col='Day Index')
ga_page_traffic_data.head()
Pageviews | |
---|---|
Day Index | |
2018-08-31 | 14 |
2018-09-01 | 6 |
2018-09-02 | 17 |
2018-09-03 | 14 |
2018-09-04 | 11 |
ga_page_traffic_data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 366 entries, 2018-08-31 to 2019-08-31
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Pageviews  366 non-null    int64
dtypes: int64(1)
memory usage: 5.7 KB
In this video we'll look at creating an alert that signals whether a number is positive, negative or unchanged.
#Create a variable called num and assign it an integer
num = -1
#Write an if statement that prints if the number is up, down or no change
if num > 0:
print("It's UP!")
elif num == 0:
print("No Change")
else:
print("It's DOWN!")
It's DOWN!
We're going to create a way of filtering our data by date range. To do this we need a start and an end date.
start = '2018-08-31'
end = '2018-09-30'
#Create a mask that will select two dates. A mask will return a boolean (True/False) array
mask = ((ga_page_traffic_data.index == start) | (ga_page_traffic_data.index == end))
mask[:30]
array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])
#Apply the mask to the dataframe
ga_page_traffic_data[mask]
Pageviews | |
---|---|
Day Index | |
2018-08-31 | 14 |
2018-09-30 | 2 |
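As an aside, because the index is a DatetimeIndex you can also select dates by label without building a mask; a minimal sketch:
#Every row from start to end inclusive, via label slicing
ga_page_traffic_data.loc[start:end]
#Or grab just the two endpoint days directly
ga_page_traffic_data.loc[[start, end]]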
#Calculate the difference between the two rows. diff() leaves NaN in the first row,
#so max() picks out the single real value and [0] extracts it from the Series
diff = ga_page_traffic_data[mask].diff().max()[0]
diff
-12.0
Now that we can select data and check the difference between two rows, it's possible to create an alert that surfaces the data we need if something has gone wrong.
start = '2018-08-31'
end = '2018-09-30'
def alerts(df, start, end):
mask = ((df.index == start) | (df.index == end)) #Create a mask
diff = df[mask].diff().max()[0] #select the difference between the two dates
chart = df[mask] #create a df with the mask applied
if diff > 0:
print("Traffic is up by {}".format(diff)) #print if traffic is above 0
elif diff == 0:
print("No Change") #print if traffic is unchanged
else:
print("Traffic is down by {}".format(diff)) #print if traffic is below 0
#Print a bar chart to show the troublesome data
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(111)
figsize = (15,8)
chart.plot(kind='bar',
figsize=figsize,
linewidth=0.5,
ax=ax
)
#Global Plot settings
plt.title('{} - {} : Traffic Drop in {}'.format(start,
end,
df.columns.values)) # set chart name
#display the charts
plt.show()
#Run the function
alerts(ga_page_traffic_data, start, end)
Traffic is down by -12.0
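Because the function takes the dataframe and dates as parameters, it's easy to reuse. A minimal usage sketch with hand-picked month-end pairs (any dates within the dataset's range would work):
#Check month-over-month movement across the first few months of data
ranges = [('2018-08-31', '2018-09-30'),
          ('2018-09-30', '2018-10-31'),
          ('2018-10-31', '2018-11-30')]
for s, e in ranges:
    alerts(ga_page_traffic_data, s, e)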