The Office of Policy and Management maintains a listing of all real estate sales with a sales price of $2,000 or greater that occur between October 1 and September 30 of each year. For each sale record, the file includes: town, property address, date of sale, property type (residential, apartment, commercial, industrial or vacant land), sales price, and property assessment.
# import libraries
import pandas as pd
import numpy as np
#load dataset
real_estate_df = pd.read_csv(r"C:\Users\jki\Downloads\Real_Estate_Sales_2001-2020_GL.csv")
real_estate_df.head(5)
C:\Users\jki\AppData\Local\Temp\ipykernel_9032\2561183663.py:6: DtypeWarning: Columns (8,9,10,11,12) have mixed types. Specify dtype option on import or set low_memory=False. real_estate_df = pd.read_csv(r"C:\Users\jki\Downloads\Real_Estate_Sales_2001-2020_GL.csv")
Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020348 | 2020 | 09/13/2021 | Ansonia | 230 WAKELEE AVE | 150500.0 | 325000.0 | 0.4630 | Commercial | NaN | NaN | NaN | NaN | NaN |
1 | 20002 | 2020 | 10/02/2020 | Ashford | 390 TURNPIKE RD | 253000.0 | 430000.0 | 0.5883 | Residential | Single Family | NaN | NaN | NaN | NaN |
2 | 200212 | 2020 | 03/09/2021 | Avon | 5 CHESTNUT DRIVE | 130400.0 | 179900.0 | 0.7248 | Residential | Condo | NaN | NaN | NaN | NaN |
3 | 200243 | 2020 | 04/13/2021 | Avon | 111 NORTHINGTON DRIVE | 619290.0 | 890000.0 | 0.6958 | Residential | Single Family | NaN | NaN | NaN | NaN |
4 | 200377 | 2020 | 07/02/2021 | Avon | 70 FAR HILLS DRIVE | 862330.0 | 1447500.0 | 0.5957 | Residential | Single Family | NaN | NaN | NaN | NaN |
# lets check for missing values
missing_values = real_estate_df.isna().sum()
print(missing_values)
Serial Number 0 List Year 0 Date Recorded 2 Town 0 Address 51 Assessed Value 0 Sale Amount 0 Sales Ratio 0 Property Type 382446 Residential Type 388309 Non Use Code 707532 Assessor Remarks 847349 OPM remarks 987279 Location 799516 dtype: int64
# let remove the missing values
real_estate_df.dropna(subset=['Property Type','Residential Type','Non Use Code','Assessor Remarks','OPM remarks','Location'],inplace = True)
# lets confirm missing values are removed
missing_values = real_estate_df.isna().sum()
print(missing_values)
Serial Number 0 List Year 0 Date Recorded 0 Town 0 Address 0 Assessed Value 0 Sale Amount 0 Sales Ratio 0 Property Type 0 Residential Type 0 Non Use Code 0 Assessor Remarks 0 OPM remarks 0 Location 0 dtype: int64
# let check if we have unwanted negative values
real_estate_df.describe()
Serial Number | List Year | Assessed Value | Sale Amount | Sales Ratio | |
---|---|---|---|---|---|
count | 2.640000e+02 | 264.000000 | 2.640000e+02 | 2.640000e+02 | 264.000000 |
mean | 3.518540e+06 | 2018.598485 | 1.701267e+05 | 1.526461e+06 | 0.789047 |
std | 2.454170e+07 | 1.318830 | 1.675329e+05 | 1.960760e+07 | 0.619845 |
min | 1.701400e+04 | 2016.000000 | 3.770000e+03 | 4.500000e+03 | 0.000700 |
25% | 1.702550e+05 | 2017.000000 | 9.030250e+04 | 1.193750e+05 | 0.442072 |
50% | 1.901695e+05 | 2019.000000 | 1.378300e+05 | 1.888500e+05 | 0.647560 |
75% | 2.007905e+05 | 2020.000000 | 1.992075e+05 | 3.526500e+05 | 1.018919 |
max | 2.020002e+08 | 2020.000000 | 2.083410e+06 | 3.187900e+08 | 7.422222 |
# lets check for data types to perform numerical calculations
real_estate_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 264 entries, 806 to 993143 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Serial Number 264 non-null int64 1 List Year 264 non-null int64 2 Date Recorded 264 non-null object 3 Town 264 non-null object 4 Address 264 non-null object 5 Assessed Value 264 non-null float64 6 Sale Amount 264 non-null float64 7 Sales Ratio 264 non-null float64 8 Property Type 264 non-null object 9 Residential Type 264 non-null object 10 Non Use Code 264 non-null object 11 Assessor Remarks 264 non-null object 12 OPM remarks 264 non-null object 13 Location 264 non-null object dtypes: float64(3), int64(2), object(9) memory usage: 30.9+ KB
import locale
# Set the locale to your preferred format (e.g., US English)
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# Calculate the rounded max
rounded_max = round(real_estate_df['Assessed Value'].max())
# Format and display as an accounting value
formatted_max = locale.currency(rounded_max, grouping=True)
print(formatted_max)
$2,083,410.00
import locale
# Set the locale to your preferred format (e.g., US English)
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# Calculate the rounded mean
rounded_mean = round(real_estate_df['Assessed Value'].mean())
# Format and display as an accounting value
formatted_mean = locale.currency(rounded_mean, grouping=True)
print(formatted_mean)
$170,127.00
# Load in some packages
import calendar
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter
# Group by 'Town' and calculate the sum of 'Assesed Values'
towns_real_estate_values = real_estate_df.groupby('Town')['Assessed Value'].sum()
# Sort the values in descending order and select the top ten
top_ten_towns = towns_real_estate_values.sort_values(ascending=False).head(10)
# Display the result
print(top_ten_towns)
# Plot the top ten towns
top_ten_towns.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten Towns with Most Assesed Values')
plt.xlabel('Town')
plt.ylabel('Assessed Value')
plt.show()
Town Danbury 9146700.0 Norwalk 6698810.0 East Lyme 2690590.0 New Milford 2644360.0 Farmington 1493760.0 Bridgeport 1363515.0 East Haven 1324850.0 New Haven 1172080.0 Cromwell 1038730.0 Woodbridge 946330.0 Name: Assessed Value, dtype: float64
# Group by 'Property Type' and calculate the sum of 'Assesed Values'
Property_Type_real_estate_values = real_estate_df.groupby('Property Type')['Assessed Value'].sum()
# Sort the values in descending order and select the top ten
top_ten_Property_Type = Property_Type_real_estate_values.sort_values(ascending=False).head(10)
# Display the result
print(top_ten_Property_Type)
# Plot the top ten Property Values
top_ten_Property_Type.plot(kind='bar', color='red', figsize=(10, 6))
plt.title('Top Ten top Property Type as per Assesed Values')
plt.xlabel('Property Type')
plt.ylabel('Assessed Value')
plt.show()
Property Type Residential 19210790.0 Single Family 17872740.0 Condo 4319350.0 Three Family 1445475.0 Two Family 1326690.0 Four Family 738400.0 Name: Assessed Value, dtype: float64
# Group by 'Residential Type and calculate the sum of 'Assessed Value'
Residential_Type_real_estate_values = real_estate_df.groupby('Residential Type')['Assessed Value'].sum()
# Sort the values in descending order and select the top ten
top_ten_Residential_Type = Residential_Type_real_estate_values.sort_values(ascending=False).head(10)
# Display the result
print(top_ten_Residential_Type)
# Plot the top ten Property Values
top_ten_Residential_Type.plot(kind='bar', color='brown', figsize=(10, 6))
plt.title('Top Ten top Residential Type as per Assesed Values')
plt.xlabel('Property Type')
plt.ylabel('Assesed Values')
plt.show()
Residential Type Single Family 32029990.0 Condo 7027790.0 Two Family 3324000.0 Three Family 1793265.0 Four Family 738400.0 Name: Assessed Value, dtype: float64
# lets fix up the dates
real_estate_df['Date Recorded'] = pd.to_datetime(real_estate_df['Date Recorded'])
real_estate_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 264 entries, 806 to 993143 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Serial Number 264 non-null int64 1 List Year 264 non-null int64 2 Date Recorded 264 non-null datetime64[ns] 3 Town 264 non-null object 4 Address 264 non-null object 5 Assessed Value 264 non-null float64 6 Sale Amount 264 non-null float64 7 Sales Ratio 264 non-null float64 8 Property Type 264 non-null object 9 Residential Type 264 non-null object 10 Non Use Code 264 non-null object 11 Assessor Remarks 264 non-null object 12 OPM remarks 264 non-null object 13 Location 264 non-null object dtypes: datetime64[ns](1), float64(3), int64(2), object(8) memory usage: 30.9+ KB
# lets create new column for month
real_estate_df['Month Recorded'] = real_estate_df['Date Recorded'].dt.month
real_estate_df.head(3)
Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Location | Month Recorded | Year Recorded | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
806 | 200594 | 2020 | 2021-02-16 | Danbury | 8 HICKORY ST | 121600.0 | 146216.0 | 0.831646 | Residential | Single Family | 25 - Other | I11192 | HOUSE HAS SETTLED PER MLS | POINT (-73.44696 41.41179) | 2 | 2021 |
981 | 200562 | 2020 | 2021-02-03 | Danbury | 19 MILL RD | 263600.0 | 415000.0 | 0.635181 | Residential | Single Family | 25 - Other | AFFORDABLE HOUSING / B15001-20-19 | INCORRECT DATA PER TOWN RECORDS | POINT (-73.53692 41.38822) | 2 | 2021 |
1529 | 200260 | 2020 | 2020-11-23 | Danbury | 32 COALPIT HILL RD #4 | 84900.0 | 181778.0 | 0.467053 | Residential | Condo | 25 - Other | J16087-4 | MULTIPLE UNIT SALE | POINT (-73.43796 41.38549) | 11 | 2020 |
# lets create new column for Year
real_estate_df['Year Recorded'] = real_estate_df['Date Recorded'].dt.year
real_estate_df.head(3)
Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Location | Month Recorded | Year Recorded | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
806 | 200594 | 2020 | 2021-02-16 | Danbury | 8 HICKORY ST | 121600.0 | 146216.0 | 0.831646 | Residential | Single Family | 25 - Other | I11192 | HOUSE HAS SETTLED PER MLS | POINT (-73.44696 41.41179) | 2 | 2021 |
981 | 200562 | 2020 | 2021-02-03 | Danbury | 19 MILL RD | 263600.0 | 415000.0 | 0.635181 | Residential | Single Family | 25 - Other | AFFORDABLE HOUSING / B15001-20-19 | INCORRECT DATA PER TOWN RECORDS | POINT (-73.53692 41.38822) | 2 | 2021 |
1529 | 200260 | 2020 | 2020-11-23 | Danbury | 32 COALPIT HILL RD #4 | 84900.0 | 181778.0 | 0.467053 | Residential | Condo | 25 - Other | J16087-4 | MULTIPLE UNIT SALE | POINT (-73.43796 41.38549) | 11 | 2020 |
# Load in some packages
import calendar
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter
warnings.filterwarnings("ignore")
import calendar
# Group by month and sum the 'Sale Amount'
monthly_sales = real_estate_df.groupby('Month Recorded')['Sale Amount'].sum()
# Find the month with the highest sales
max_month = monthly_sales.idxmax()
earnings_for_best_month = sales_by_month.max()
print(f"The month with the highest sales is {max_month} with earning of {earnings_for_best_month:,.2f}")
# Plot the highest number of sales for each month
plt.figure(figsize=(10, 6))
plt.bar(monthly_sales.index, monthly_sales, color='green')
plt.xlabel('Month')
plt.ylabel('Total Sales Amount')
plt.title('Total Sales Amount for Each Month')
plt.xticks(rotation=45)
plt.show()
The month with the highest sales is 7 with earning of 331,269,169.00
import calendar
# Group by Year and sum the 'Sale Amount'
yearly_sales = real_estate_df.groupby('Year Recorded')['Sale Amount'].sum()
# Find the month with the highest sales
max_year = yearly_sales.idxmax()
earnings_for_best_year = yearly_sales.max()
print(f"The Year with the highest sales is {max_year} with earning of {earnings_for_best_year:,.2f}")
# Plot the highest number of sales for each month
plt.figure(figsize=(10, 6))
plt.bar(yearly_sales.index, yearly_sales, color='purple')
plt.xlabel('Year')
plt.ylabel('Total Sales Amount')
plt.title('Total Sales Amount for Each Month')
plt.xticks(rotation=45)
plt.show()
The Year with the highest sales is 2021 with earning of 352,540,596.00
# Group by 'Town' and calculate the sum of 'Assesed Values'
towns_real_estate_values = real_estate_df.groupby('Town')['Sale Amount'].sum()
# Sort the values in descending order and select the top ten
top_ten_towns = towns_real_estate_values.sort_values(ascending=False).head(10)
# Display the result
print(top_ten_towns)
# Plot the top ten counties
top_ten_towns.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten most expensive Towns ')
plt.xlabel('Town')
plt.ylabel('Sale Amount')
plt.show()
Town Willington 318790019.0 Danbury 17757439.0 Norwalk 15537677.0 East Lyme 8303750.0 New London 6148000.0 New Milford 3382500.0 Farmington 2826187.0 Bridgeport 1759267.0 Cromwell 1527500.0 East Haven 1468900.0 Name: Sale Amount, dtype: float64
# Group by 'Property Type' and calculate the sum of 'Assesed Values'
Property_Type_real_estate_values = real_estate_df.groupby('Property Type')['Sale Amount'].sum()
# Sort the values in descending order and select the top ten
top_ten_Property_Type = Property_Type_real_estate_values.sort_values(ascending=False).head(10)
# Display the result
print(top_ten_Property_Type)
# Plot the top ten Property Values
top_ten_Property_Type.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten most expensive Property Types ')
plt.xlabel('Property Type')
plt.ylabel('Sale Amount')
plt.show()
Property Type Residential 362613502.0 Single Family 24145005.0 Four Family 6605000.0 Condo 6328190.0 Two Family 1765000.0 Three Family 1529000.0 Name: Sale Amount, dtype: float64
# Group by 'Residential Type and calculate the sum of 'Assessed Value'
Residential_Type_real_estate_values = real_estate_df.groupby('Residential Type')['Sale Amount'].sum()
# Sort the values in descending order and select the top ten
top_ten_Residential_Type = Residential_Type_real_estate_values.sort_values(ascending=False).head(10)
# Display the result
print(top_ten_Residential_Type)
# Plot the top ten Property Values
top_ten_Residential_Type.plot(kind='bar', color='skyblue', figsize=(10, 6))
plt.title('Top Ten top Residential Type with Most Sale Amount')
plt.xlabel('Residential Type')
plt.ylabel('Sale Amount')
plt.show()
Residential Type Single Family 373705042.0 Condo 13935155.0 Four Family 6605000.0 Two Family 6131500.0 Three Family 2609000.0 Name: Sale Amount, dtype: float64