Regional Property Price Analysis: England 2017-2018¶
Author: Rashad Malik
Project aim and outline¶
The UK Ministry of Housing, Communities & Local Government has requested an analysis of regional property price changes in England over a one-year period. This analysis includes testing whether price trends are independent of regional classification using statistical methods.
This study provides insights into regional market dynamics and evaluates the effectiveness of statistical tests in capturing these relationships.
The notebook contains the following sections:
- Introduction
- Importing required libraries, describing the dataset and loading the data
- Analysis
- Part 1: Calculating and displaying price changes
- Part 2: Statistical analysis
- Part 3: Discussion
- Summary and conclusion
- References
Introduction¶
Importing libraries¶
The following libraries are required for our analysis:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, percentileofscore
%matplotlib inline
The dataset¶
Dataset description and variables¶
The file average-flat-prices.csv contains information about the average price of flats (also called apartments) in England over a 36-month period. The dataset contains the following fields:
| Field Name | Description |
|---|---|
| Area | The name of a local government area |
| Code | The code of the area |
| Price | Average sale price of a flat (an apartment) property in this area in the month |
| Month | A date (the first of the month) between 1 September 2016 and 1 August 2019; 36 months in total |
| RegionName | The name of the region of which this area is part |
| RegionCode | The code of the region |
Additionally, the first three characters of the Code column specify the type of local government area. The distinction between area types is not important in our analysis; it is just additional context for the Code variable.
| Code Prefix | Type of Area |
|---|---|
| E10 | County |
| E09 | London Borough |
| E08 | Metropolitan Borough |
| E06 | English unitary authority |
Data loading¶
# Loading the dataset
prices = pd.read_csv("data/average_flat_prices.csv")
# Printing dataframe information
prices.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5436 entries, 0 to 5435
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Area        5436 non-null   object
 1   Code        5436 non-null   object
 2   Price       5436 non-null   float64
 3   Month       5436 non-null   object
 4   RegionName  5436 non-null   object
 5   RegionCode  5436 non-null   object
dtypes: float64(1), object(5)
memory usage: 254.9+ KB
The data has been successfully loaded, and our dataframe prices contains 5'436 rows and 6 columns.
We can also see there are no missing (null) cells. However, the Month column is stored with an "object" data-type, so first we will convert it into a datetime type, which will allow us to manipulate the dates more easily.
# Converting month column into datetime type
prices["Month"] = pd.to_datetime(prices["Month"])
print("'Month' Dtype:", prices["Month"].dtype)
'Month' Dtype: datetime64[ns]
The column has successfully been changed to datetime, so we are now ready to proceed with our analysis.
Analysis¶
Part 1: Calculating and displaying price changes¶
1.1 Calculating price changes¶
I will compare the change in average flat prices in each area between July 2017 and July 2018.
I'll begin by creating a new dataframe with the average prices for each area in those two dates.
# Filtering the data for July 2017 and July 2018
july_2017 = prices[prices["Month"] == "2017-07-01"]
july_2018 = prices[prices["Month"] == "2018-07-01"]
# Merging the data into a new dataframe
price_changes = pd.merge(july_2017[["Area", "RegionName", "Price"]],
july_2018[["Area", "RegionName", "Price"]],
on=["Area", "RegionName"],
suffixes=("_2017", "_2018"))
# Calculating the price difference and percentage change
price_changes["Price Difference"] = price_changes["Price_2018"] - price_changes["Price_2017"]
price_changes["Percentage Change"] = ((price_changes["Price_2018"] - price_changes["Price_2017"]) / price_changes["Price_2017"]) * 100
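As a side note, pandas can verify that a merge like the one above is one-to-one on the join keys, so a duplicated Area/RegionName pair in either month would raise an error instead of silently multiplying rows. A minimal sketch with toy data (the column names mirror those used above):

```python
import pandas as pd

# Toy versions of the July 2017 and July 2018 slices
july_2017 = pd.DataFrame({"Area": ["A", "B"], "RegionName": ["R1", "R1"],
                          "Price": [100000.0, 200000.0]})
july_2018 = pd.DataFrame({"Area": ["A", "B"], "RegionName": ["R1", "R1"],
                          "Price": [110000.0, 190000.0]})

# validate="one_to_one" raises a MergeError if either side has duplicate keys
merged = pd.merge(july_2017, july_2018,
                  on=["Area", "RegionName"],
                  suffixes=("_2017", "_2018"),
                  validate="one_to_one")
merged["Percentage Change"] = ((merged["Price_2018"] - merged["Price_2017"])
                               / merged["Price_2017"]) * 100
```

With a dataset like this one, where each area should appear exactly once per month, the `validate` argument is a cheap safeguard against accidental row duplication.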
1.2 Visualising price changes¶
Next I will create plots to help visualise the change in area prices in each region. I'll use two visualisations:
- Overall dot-plot: The dot plot will allow viewing the overarching distribution of price changes across all regions.
- Regional bar charts: The 9 subplots will allow viewing price changes at a more granular level for each individual region.
# Sorting by RegionName in reverse alphabetical order, so the dot plot's y-axis reads alphabetically from top to bottom
price_changes = price_changes.sort_values("RegionName", ascending=False)
region_names = price_changes["RegionName"].unique()
# Setting the plot style
plt.style.use("fivethirtyeight")
# Creating the dot plot
fig, ax = plt.subplots(figsize=(8, 5))
for region in price_changes["RegionName"].unique():
    region_data = price_changes[price_changes["RegionName"] == region]
    ax.scatter(region_data["Percentage Change"], [region] * len(region_data),
               s=abs(region_data["Price Difference"]) / 100, alpha=0.7, edgecolor="k")
ax.set_yticks(range(len(region_names)))
ax.set_yticklabels(region_names, fontsize=10)
ax.set_xlabel("Percentage Change (%)", fontsize=12, fontweight="bold", labelpad=10)
ax.set_ylabel("Region", fontsize=12, fontweight="bold", labelpad=-18)
ax.grid(axis="x", linestyle="--", alpha=0.7)
x_ticks = ax.get_xticks()
ax.set_xticks(x_ticks)
ax.set_xticklabels([f"{tick:.0f}" for tick in x_ticks], fontsize=10)
# Adding a legend that explains dot sizes
sizes = [500, 5000, 50000]
labels = [f"{size:,}".replace(",", "'") for size in sizes]
for size, label in zip(sizes, labels):
    ax.scatter([], [], s=size / 100, alpha=0.7, edgecolor="k", color="grey", label=f"£{label}")
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[-len(sizes):], labels[-len(sizes):], title="Price change value", bbox_to_anchor=(1.0, 1),
loc="upper left", frameon=True, labelspacing=2, handletextpad=1, borderpad=0.8, fontsize=10, title_fontsize=10)
# Titles and signature
plt.text(-0.27, 1.08,
"Percentage change of average flat prices per region between July 2017 and July 2018",
size=14, weight="bold", color="black",
transform=plt.gca().transAxes)
plt.text(-0.27, 1.02,
"Dot sizes represent the absolute price change for each area",
size=13, color="black",
transform=plt.gca().transAxes)
plt.text(-0.27, -0.2,
"RASHAD MALIK" + " " * 128 + "Source: average-flat-prices.csv",
color="#f0f0f0",
backgroundcolor="#4d4d4d",
fontsize=12,
transform=plt.gca().transAxes)
plt.show()
# Changing the order of region names so that it remains alphabetical for the next visualisation
price_changes = price_changes.sort_values("RegionName", ascending=True)
regions = price_changes["RegionName"].unique()
fig, axes = plt.subplots(3, 3, figsize=(20, 20))  # 3 rows, 3 columns of subplots
axes = axes.flatten()
for i, region in enumerate(regions):
    regional_data = price_changes[price_changes["RegionName"] == region]
    regional_data = regional_data.sort_values(by="Area", ascending=False)
    colours = regional_data["Price Difference"].apply(lambda x: "green" if x > 0 else "red")
    ax = axes[i]
    ax.barh(regional_data["Area"], regional_data["Price Difference"], color=colours)
    ax.set_title(region, fontsize=12, weight="bold")
    ax.set_xlabel("Price Change (£)", fontsize=12)
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=8)
plt.subplots_adjust(wspace=0.35, hspace=0.2)
# Titles and signature
plt.text(-3.08, 3.55,
"Average flat price changes between July 2017 and July 2018",
size=14, weight="bold", color="black",
transform=plt.gca().transAxes)
plt.text(-3.08, 3.5,
"Regional subplots with price changes per area",
size=13, color="black",
transform=plt.gca().transAxes)
plt.text(-3.08, -0.15,
"RASHAD MALIK" + " " * 300 + "Source: average-flat-prices.csv",
color="#f0f0f0",
backgroundcolor="#4d4d4d",
fontsize=12,
transform=plt.gca().transAxes)
plt.show()
1.3 Commenting on results¶
I can now make some observations from the above visualisations. The dot plot shows that London exhibits the widest range of percentage changes in flat prices, with some areas experiencing slight positive changes while others face significant declines (London is the only region with declines beyond -6%). Regions like the North East and the South East mostly show negative changes, suggesting potential economic struggles in these areas. Conversely, regions such as the East Midlands and parts of the West Midlands show strong positive changes in specific areas, potentially indicating development there.
Additionally, across all regions, the further a dot sits from 0%, the larger it tends to be. This indicates that, in general, areas with higher-priced flats tend to have larger (positive or negative) price changes.
The bar chart subplots further illustrate these trends, showing that London stands out with the largest absolute price changes both positive and negative, reflecting the volatility of its housing market. The City of Westminster is the area with the largest price change in the dataset, decreasing in price by over £80'000.
Overall, regional disparities in price changes are evident, with specific regions struggling and other regions showing more positive trends. These patterns underscore the influence of economic, social, and infrastructural factors on flat price changes across England.
Part 2: Statistical analysis¶
2.1 Cross-tabulation¶
I will now move on to statistical tests of the regional price changes between July 2017 and July 2018.
First, I'll cross-tabulate the number of areas in which the price has increased or decreased by region.
# Creating a new column
price_changes["Change Direction"] = price_changes.apply(
lambda row: "Increased" if row["Price_2018"] > row["Price_2017"]
else "Decreased" if row["Price_2018"] < row["Price_2017"]
else "No Change",
axis=1
)
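The same column can also be built without a row-wise apply. A vectorised sketch with toy data, using numpy's np.select (this is an equivalent alternative, not the approach used above):

```python
import numpy as np
import pandas as pd

# Toy prices for three areas: one rise, one fall, one unchanged
df = pd.DataFrame({"Price_2017": [100.0, 120.0, 150.0],
                   "Price_2018": [110.0, 100.0, 150.0]})

# np.select evaluates the conditions in order and falls back to the default
conditions = [df["Price_2018"] > df["Price_2017"],
              df["Price_2018"] < df["Price_2017"]]
df["Change Direction"] = np.select(conditions, ["Increased", "Decreased"],
                                   default="No Change")
```

On large dataframes the vectorised form avoids the per-row Python overhead of `apply(..., axis=1)`.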
# Using groupby to create a crosstab result
crosstab_result = (
price_changes.groupby("RegionName")["Change Direction"]
.value_counts()
.unstack(fill_value=0)
)
crosstab_result
| RegionName | Decreased | Increased |
|---|---|---|
| East Midlands | 0 | 9 |
| East of England | 5 | 6 |
| London | 23 | 10 |
| North East | 9 | 3 |
| North West | 12 | 11 |
| South East | 13 | 6 |
| South West | 6 | 9 |
| West Midlands | 2 | 12 |
| Yorkshire and The Humber | 7 | 8 |
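For reference, pandas' built-in pd.crosstab produces the same table as the groupby / value_counts / unstack chain above (the permutation test below uses pd.crosstab directly). A small sketch with toy data:

```python
import pandas as pd

# Toy data with two regions and both change directions
df = pd.DataFrame({"RegionName": ["London", "London", "North East"],
                   "Change Direction": ["Increased", "Decreased", "Decreased"]})

# Chain of groupby -> value_counts -> unstack, as used in the analysis
via_groupby = (df.groupby("RegionName")["Change Direction"]
                 .value_counts()
                 .unstack(fill_value=0))

# Equivalent one-liner
via_crosstab = pd.crosstab(df["RegionName"], df["Change Direction"])
```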
2.2 Chi-square test and permutation test¶
I will use two methods to test the independence of region vs. prices changes:
- Chi-square test
- Permutation test
The chi-square test is a statistical test used to determine whether there is a significant association between two categorical variables by comparing observed and expected frequencies in a contingency table. Additionally, I'll use the G statistic (also known as the likelihood ratio chi-square statistic), which is an alternative to the Pearson chi-square statistic:
# Calculating the G-test statistic and the p-value
test_stat, p_value, dof, expected = chi2_contingency(crosstab_result, correction=False,
lambda_ = "log-likelihood")
print('The G test statistic is %4.2f' % test_stat)
print('The p-value is %4.3f %%' % (100 * p_value))
The G test statistic is 32.19
The p-value is 0.009 %
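For reference, the G statistic is defined as $G = 2 \sum_{ij} O_{ij} \ln(O_{ij} / E_{ij})$, where $O$ and $E$ are the observed and expected cell counts. A minimal sketch computing it by hand on a toy 2×2 table and checking against scipy (scipy.special.xlogy is used so that zero cells contribute 0, by the convention $0 \ln 0 = 0$):

```python
import numpy as np
from scipy.special import xlogy
from scipy.stats import chi2_contingency

# Toy contingency table (note the zero cell)
obs = np.array([[0, 9],
                [23, 10]])

# Expected counts under independence: (row total * column total) / grand total
row = obs.sum(axis=1, keepdims=True)
col = obs.sum(axis=0, keepdims=True)
expected = row * col / obs.sum()

# G = 2 * sum(O * ln(O / E)); xlogy returns 0 where O == 0
g_manual = 2 * xlogy(obs, obs / expected).sum()

# scipy's log-likelihood (G-test) variant should agree
g_scipy, _, _, _ = chi2_contingency(obs, correction=False, lambda_="log-likelihood")
```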
I have successfully calculated the G statistic and the chi-square p-value. I will now continue with the permutation test, and discuss and compare the results of both tests at the end of this section. A permutation test is a statistical test used to assess the significance of an observed effect by comparing it to a distribution of effects generated by randomly shuffling the data. Additionally, I'll create a histogram of generated G-statistics from the permutation test.
NOTE: As this test involves randomness, I'll set a seed to ensure reproducible testing with code from this notebook.
# Setting a random seed
seed = 42
np.random.seed(seed)
# Permutation test
permuted_g_stats = []
for _ in range(20000):
    # Shuffling Change Direction destroys any real association with RegionName
    shuffled = price_changes["Change Direction"].sample(
        frac=1, replace=False,
        random_state=np.random.randint(0, 10**6)).reset_index(drop=True)
    permuted_table = pd.crosstab(price_changes["RegionName"], shuffled)
    permuted_g, _, _, _ = chi2_contingency(permuted_table, lambda_="log-likelihood")
    permuted_g_stats.append(permuted_g)
# Calculating the p-value
p_value = np.mean(np.array(permuted_g_stats) >= test_stat)
# Unique tables count
unique_tables = len(np.unique(permuted_g_stats))
# Calculating p-value percent
p_value_percent = 100 * (1 - percentileofscore(permuted_g_stats, test_stat, kind="strict") / 100)
print(f"Random seed used: {seed}")
print(f"Generated {unique_tables} unique tables out of 20'000 total permutations")
print(f"The p-value for the test is {p_value_percent:.3f} %")
Random seed used: 42
Generated 19087 unique tables out of 20'000 total permutations
The p-value for the test is 0.020 %
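The two p-value computations above are equivalent: np.mean(stats >= observed) equals 1 - percentileofscore(stats, observed, kind="strict") / 100, because kind="strict" counts only the values strictly below the score, so its complement is exactly the fraction at or above it. A toy sketch:

```python
import numpy as np
from scipy.stats import percentileofscore

# Toy permutation statistics and an observed value
stats = [10.0, 20.0, 30.0, 40.0]
observed = 30.0

# Direct fraction of permuted statistics >= observed
p_direct = np.mean(np.array(stats) >= observed)

# Same quantity via the strict percentile rank
p_via_percentile = 1 - percentileofscore(stats, observed, kind="strict") / 100
```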
# Creating the G statistic histogram
plt.hist(permuted_g_stats, bins=20, log=False, color="steelblue", edgecolor="black", alpha=0.7)
plt.axvline(test_stat, color="orange", linestyle="-", linewidth=1.5, label=f"Observed G-statistic ({test_stat:.2f})")
plt.xlabel("G-statistic", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.tick_params(axis="x", labelsize=10)
plt.tick_params(axis="y", labelsize=10)
plt.gca().xaxis.set_minor_locator(plt.MultipleLocator(1))
plt.grid(which="minor", axis="x", linestyle=":", linewidth=1)
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(100))
plt.grid(which="minor", axis="y", linestyle=":", linewidth=0.7)
# Titles and signature
plt.text(-0.13, 1.11,
"Distribution of permuted G statistics",
size=14, weight="bold", color="black",
transform=plt.gca().transAxes)
plt.text(-0.13, 1.05,
"Over 20'000 permutations",
size=13, color="black",
transform=plt.gca().transAxes)
plt.text(-0.13, -0.2,
"RASHAD MALIK" + " " * 48 + "Source: average-flat-prices.csv",
color="#f0f0f0",
backgroundcolor="#4d4d4d",
fontsize=12,
transform=plt.gca().transAxes)
plt.legend(fontsize=11)
plt.show()
Part 3: Discussion¶
I have successfully conducted both a chi-square test, and a permutation test on the data. Both of these tests are designed around two main hypotheses:
- Null Hypothesis ($H_0$): There is no association between `Change Direction` (whether average flat prices increased or decreased) and `RegionName` (regions in England).
- Alternative Hypothesis ($H_1$): There is a statistically significant association between `Change Direction` and `RegionName` that is not likely due to random chance.
The table below shows the key statistics acquired from running the above tests.
| Test Name | Observed G Statistic | p-value |
|---|---|---|
| Chi-Square Test | 32.19 | 0.009% |
| Permutation Test | 32.19 | 0.020% |
The significance level ($\alpha$) is typically set at 0.05 (5%). If the p-value is less than that value, we can reject the null hypothesis and conclude that there is a statistically significant association between the Change Direction and Region.
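Equivalently, instead of comparing p-values against $\alpha$, one can compare the observed G statistic against the chi-square critical value with $dof = (9 - 1)(2 - 1) = 8$ degrees of freedom for the 9×2 table. A sketch using the figures above:

```python
from scipy.stats import chi2

alpha = 0.05
dof = (9 - 1) * (2 - 1)  # (rows - 1) * (columns - 1) for the 9x2 table

# Critical value of the chi-square distribution at the 95th percentile
critical_value = chi2.ppf(1 - alpha, dof)

observed_g = 32.19
reject_null = observed_g > critical_value  # True: G exceeds the critical value
```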
Looking at the p-values for both tests, they are essentially 0%, meaning we can reject the null hypothesis and conclude that the association between the two variables is very unlikely to be just due to chance.
Furthermore, the permutation test reinforces this conclusion. Each permutation generated a G statistic for a random pairing of the two variables. Over the 20'000 permutations, only 0.02% of the generated G statistics were equal to or greater than the observed G statistic of 32.19, meaning a table as extreme as the one observed is an extremely rare occurrence under the null hypothesis, highlighting the improbability of the association being due to chance alone.
The histogram also clearly highlights this: the observed G statistic lies far to the right of the bulk of the permuted G statistics. Since the permutation test randomly creates a new table each cycle, 19'087 unique tables were generated out of the 20'000 total permutations (some were repeated by chance). More permutations give a more precise p-value estimate, at the cost of more computing time. In this case, running the permutation test multiple times still yielded similar results, so we can be confident in rejecting the null hypothesis.
Summary and conclusion¶
This analysis explored how property prices changed across regions in England between July 2017 and July 2018. Using visualisations and statistical tests, I examined whether the direction of price changes (whether prices increased or decreased) was connected to the region.
The results of the statistical tests were clear. Both the chi-square test and the permutation test gave highly significant results, with p-values of 0.009% and 0.020%, respectively. These figures are far below the typical threshold of 5%, which means we can confidently reject the idea that the direction of price changes is unrelated to the region. The observed G-statistic of 32.19 was much higher than almost all the values generated during the permutation test, showing that the relationship between region and price changes is very unlikely to be due to chance.
The visualisations backed up these findings. London stood out for its price volatility, with areas experiencing both significant increases and decreases. Meanwhile, regions like the North East and South East showed predominantly negative trends, suggesting potential economic struggles in these areas. On the other hand, places like the East Midlands displayed strong positive price changes, possibly indicating development or growth in those regions.
In summary, this analysis demonstrates a clear link between regional classifications and the direction of property price changes. This information could help inform policy decisions and better understand the housing market in England.
References¶
- Ministry of Housing, Communities & Local Government, GOV.UK, Accessed on: November 2024, https://www.gov.uk/
- SciPy Documentation - Chi-Square Test, scipy.stats.chi2_contingency, https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html