Statistical Analysis of Texas Bridges Using Regression Modelling¶

Author: Rashad Malik

Project aim and outline¶

The aim of this project is to conduct a comprehensive analysis of bridges in Texas, US, to understand the predictive power of specific variables on bridge conditions. Through data preparation, exploratory analysis, and regression modelling, the project seeks to determine how factors such as bridge age, average usage, percentage of truck traffic, material, and design impact overall bridge condition.

This analysis explores key predictors of structural health to provide insights that could help guide future maintenance plans and resource allocation. The project also aims to provide transparent methodologies and assumptions to enable informed decision-making.

The notebook contains the following sections:

  • Introduction
    • Importing required libraries, describing the dataset and loading the data
  • Analysis
    • Part 1: Data preparation
    • Part 2: Exploratory analysis
    • Part 3: Regression modelling
  • Summary and conclusion
  • References

Introduction¶

Importing libraries¶

The following libraries are required for our analysis:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, root_mean_squared_error
%matplotlib inline

The dataset¶

Dataset description and variables¶

The data was obtained from the US Department of Transportation's Federal Highway Administration (FHWA), specifically from their National Bridge Inventory database.

For this project, we are using a greatly simplified subset of the data:

  • Our data contains bridge data related to Texas only.
  • Culverts have been excluded from the dataset, leaving us only with bridges for analysis.

Below is a description of the variables within our dataset:

The original FHWA dataset has over 100 variables; ours is simplified. Both continuous and categorical variables are included.

| Variable | Description | Type |
|---|---|---|
| Structure_id | Unique identifier of the bridge | string |
| District | Highway district in Texas responsible for the bridge | category |
| Detour_Km | Length of detour if the bridge is closed | continuous |
| Toll | Whether a toll is paid to use the bridge | category |
| Maintainer | The authority responsible for maintenance | category |
| Urban | Whether the bridge is located in an urban or rural area | category |
| Status | The road class: interstate to local | category |
| Year | The year the bridge was built | continuous |
| Lanes_on | The number of lanes that run over the bridge | continuous (or discrete) |
| Lanes_under | The number of lanes that run under the bridge | continuous (or discrete) |
| AverageDaily | The average daily traffic (number of vehicles) | continuous |
| Future_traffic | The estimated daily traffic in approx. 20 years' time | continuous |
| Trucks_percent | The percentage of traffic made up of 'trucks' (i.e. lorries) | continuous |
| Historic | Whether the bridge is historic | category |
| Service_under | The (most important) service that runs under the bridge | category |
| Material | The dominant material the bridge is made from | category |
| Design | The design of the bridge | category |
| Spans | The number of spans the bridge has | category (or discrete) |
| Length | The length of the bridge in metres | continuous |
| Width | The width of the bridge in metres | continuous |
| Rated_load | The rated maximum loading of the bridge (in tonnes) | continuous |
| Scour_rating | Only for bridges over water: the 'scour' condition | ordinal |
| Deck_rating | The condition of the deck of the bridge | ordinal |
| Superstr_rating | The condition of the bridge superstructure | ordinal |
| Substr_rating | The condition of the bridge substructure (foundations) | ordinal |

Note on 'scour': when a bridge is over (for example) a river, the flow of water in the river can undermine any bridge supports (called 'piers') in the water. This is called 'scouring'. The Scour_rating gives the condition with respect to possible damage from scouring.

Values of categorical variables: In the original data, the values of the categorical variables are represented as integers, with their meanings given in a data dictionary. In our dataset, these 'numeric codes' have been replaced with suitable names.

| Variable | Values |
|---|---|
| District | Each district has a unique number |
| Toll | Toll, Free |
| Maintainer | State, County, Town or City, Agency, Private, Railroad, Toll Authority, Military, Unknown |
| Urban | Urban, Rural |
| Status | Interstate, Arterial, Minor, Local |
| Historic | Register, Possible, Unknown, Not historic |
| Service_under | Other, Highway, Railroad, Pedestrian, Interchange, Building |
| Material | Other, Concrete, Steel, Timber, Masonry |
| Design | Other, Slab, Beam, Frame, Truss, Arch, Suspension, Movable, Tunnel, Culvert, Mixed |
| Scour_rating | Unknown, Critical, Unstable, Stable, Protected, Dry, No waterway |
| Deck_rating | NA, Excellent, Very Good, Good, Satisfactory, Fair, Poor, Serious, Critical, Failing, Failed |
| Superstr_rating | Same rating scale as Deck_rating |
| Substr_rating | Same rating scale as Deck_rating |
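The replacement of numeric codes with names can be sketched as follows. Note that the codes used here are hypothetical, purely for illustration; the real values are defined in the FHWA data dictionary:

```python
import pandas as pd

# Hypothetical numeric codes for the Toll variable -- illustration only,
# NOT the actual FHWA data-dictionary values.
code_to_name = {1: "Toll", 3: "Free"}

raw = pd.Series([1, 3, 3, 1], name="Toll")        # codes as stored in the original data
named = raw.map(code_to_name).astype("category")  # named categories, as in our dataset
print(named.tolist())  # → ['Toll', 'Free', 'Free', 'Toll']
```

The same `map`-then-`astype("category")` pattern would apply to any of the coded columns above.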

Data loading¶

Below, we load the dataset from a CSV file and explicitly define the data types for each variable. By using a "type map", we avoid the default behaviour of treating non-numeric fields as strings, allowing us to define categorical variables accurately.

Categorical variables, such as Structure_id, Toll, and Material, are assigned the category data type. For ordinal variables (those with a meaningful order), we use pd.CategoricalDtype with a specified order. For instance, the Deck_rating, Superstr_rating, and Substr_rating fields are assigned a rating_type with categories ordered from 'Failed' to 'Excellent'. Similarly, Scour_rating is given a scour_type with categories that range from 'Unknown' to 'No Waterway'.

This approach ensures that categorical and ordinal variables are correctly interpreted by the analysis, improving data clarity and consistency. Finally, the dataset is loaded with Structure_id as the index column, which will help us with handling our data.

In [2]:
# Defining the categorical data type for bridge ratings with an explicit order
rating_type = pd.CategoricalDtype(
    categories=["Failed", "Failing", "Critical", "Serious", "Poor", "Fair",
                "Satisfactory", "Good", "Very Good", "Excellent", "NA"],
    ordered=True)

# Defining the categorical data type for scour ratings with an explicit order
scour_type = pd.CategoricalDtype(
    categories=["Unknown", "Critical", "Unstable", "Stable", "Protected", "Dry", "No Waterway"],
    ordered=True)

# Defining a dictionary specifying data types for each column in the dataset
types_dict = {
    "Structure_id": str,
    "Toll": "category",
    "Maintainer": "category",
    "Urban": "category",
    "Status": "category",
    "Historic": "category",
    "Service_under": "category",
    "Material": "category",
    "Design": "category",
    "Deck_rating": rating_type,
    "Superstr_rating": rating_type,
    "Substr_rating": rating_type,
    "Scour_rating": scour_type
}

# Load the dataset with specified data types and set Structure_id as the index column
bridges = pd.read_csv("data/tx19_bridges_sample.csv", dtype=types_dict, index_col="Structure_id")

# Print dataframe information
bridges.info()
<class 'pandas.core.frame.DataFrame'>
Index: 34293 entries, 000021521-00101 to DAPTRABLI000011
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   District         34293 non-null  object  
 1   Detour_Km        34293 non-null  int64   
 2   Toll             34293 non-null  category
 3   Maintainer       34293 non-null  category
 4   Urban            34293 non-null  category
 5   Status           34293 non-null  category
 6   Year             34293 non-null  int64   
 7   Lanes_on         34293 non-null  int64   
 8   Lanes_under      34293 non-null  int64   
 9   AverageDaily     34293 non-null  int64   
 10  Historic         34293 non-null  category
 11  Service_under    34293 non-null  category
 12  Material         34293 non-null  category
 13  Design           34293 non-null  category
 14  Spans            34293 non-null  int64   
 15  Length           34293 non-null  float64 
 16  Width            34293 non-null  float64 
 17  Deck_rating      34288 non-null  category
 18  Superstr_rating  34291 non-null  category
 19  Substr_rating    34293 non-null  category
 20  Rated_load       34293 non-null  float64 
 21  Trucks_percent   34293 non-null  float64 
 22  Scour_rating     24520 non-null  category
 23  Future_traffic   34293 non-null  int64   
dtypes: category(12), float64(4), int64(7), object(1)
memory usage: 4.8+ MB

We can see that the data has been successfully loaded: our dataframe bridges contains 34,293 rows (bridges) and 24 columns (variables).

We can also see some null (missing) values in the Deck_rating and Superstr_rating columns (and a substantial number in Scour_rating). The rating columns will be necessary for our analysis later, so we will eventually need to address this issue.

Analysis¶

Part 1: Data preparation¶

1.1 Deriving the age of bridges¶

As part of the analysis, I needed to find how the age of a bridge may influence its condition. However, the dataset does not contain this information directly, therefore I created a new variable Age which is derived from the Year variable.

In [3]:
# Calculating bridge age from the construction year
bridges["Age"] = 2024 - bridges["Year"]

The new variable Age has now been created; let's check some quick summary statistics for this column.

In [4]:
bridges["Age"].describe()
Out[4]:
count    34293.000000
mean        42.502581
std         23.860135
min          5.000000
25%         22.000000
50%         39.000000
75%         60.000000
max        124.000000
Name: Age, dtype: float64

The summary statistics for bridge age show a large range, with ages spanning from 5 to 124 years. The average age is approximately 42.5 years, and the distribution is fairly spread out, with a standard deviation of 23.86 years, reflecting a wide variety of Texan bridge ages.

1.2 Excluding older bridges¶

Since this analysis focuses on factors influencing the condition of bridges, which in turn could inform bridge maintenance and planning strategies, I needed to decide what to do with older bridges.

The inclusion of older bridges in the analysis may affect the results, as they may have their own specialised maintenance routines. Therefore I trimmed the data to remove some of these bridges. My approach to removing older bridges was the following:

  • Investigate the Historic column, and check if this information could be used for the selection.
  • Explore the age distribution of bridges.

First, let's explore the Historic column. As described on the FHWA's detailed code mapping for individual data items page, this column consists of the following four possible categories:

  • Register: Bridge is on the National Register.
  • Possible: Bridge is eligible for the National Register.
  • Unknown: Historic significance of the bridge has not been determined.
  • Not historic: Bridge is not eligible for the National Register, and is not in a historic district eligible for the National Register.

I constructed a violin plot to visualise the distribution of this variable against the bridge age.

In [5]:
# Violin plot
sns.set_style("whitegrid")
plt.figure(figsize = (7, 4))
sns.violinplot(bridges, x="Historic", y="Age", cut=0, alpha=0.8)

plt.xlabel("Historic status", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.ylabel("Bridge age", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(5))
plt.grid(which="minor", axis="y", linestyle=":", linewidth=0.7)
plt.ylim(bottom=0)

plt.text(-1, 148, 
         "Age distribution of bridges", 
         size = 14, weight = "bold", color = "black")
plt.text(-1, 138, 
         "by historic status", 
         size = 13, color = "black")
plt.text(-1, -38, 
         "RASHAD MALIK" + " " * 42 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: "Age distribution of bridges by historic status" (violin plot)]

From the violin plot, we can observe that registered historic bridges tend to be the oldest, while the non-historic and unknown categories generally include younger structures. However, this is not a guarantee: both the Register and Possible categories also contain bridges at the younger end of the age range.

We can create a table with the ranges to see the minimum and maximum ages within each category:

In [6]:
# Grouping the data by historic category, and calculating minimum and maximum ages
age_stats = bridges.groupby("Historic", observed=False)["Age"].agg(["min", "max"])
age_stats
Out[6]:
              min  max
Historic
Not historic    5   94
Possible        6  124
Register        6  124
Unknown         7   64

Since the Historic variable has such a wide range of ages regardless of category, I decided not to use it for excluding older bridges. Instead, I tried a different approach, analysing bridge ages directly.

First, I visualised the ages of the bridges on a probability density plot, and highlighted the 90th percentile. This shows the oldest 10% of bridges.

In [7]:
# Calculating the 90th percentile of the bridge ages
percentile_90 = bridges["Age"].quantile(0.9)

# Plotting the probability density plot
sns.kdeplot(bridges, x="Age", fill=True)
plt.xlabel("Bridge age", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.ylabel("Density", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(0.0005))
plt.grid(which="minor", axis="y", linestyle=":", linewidth=0.7)
plt.gca().xaxis.set_minor_locator(plt.MultipleLocator(5))
plt.grid(which="minor", axis="x", linestyle=":", linewidth=0.7)

# Adding a vertical line at the 90th percentile
plt.axvline(percentile_90, color="orange", linestyle="-", label=f"90th Percentile ({percentile_90:.2f})")

plt.text(-36.5, 0.02, 
         "Probability density of bridge age", 
         size = 14, weight = "bold", color = "black")
plt.text(-36.5, 0.0188, 
         "with 90th percentile highlighted", 
         size = 13, color = "black")
plt.text(-36.5, -0.0043, 
         "RASHAD MALIK" + " " * 42 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.legend()
plt.show()
[Figure: "Probability density of bridge age, with 90th percentile highlighted"]

Since historic status alone does not determine whether a bridge is unusually old, I decided to exclude bridges above the 90th percentile of age (74 years), as this provides a more data-driven way to filter out very old bridges while retaining a representative mix of ages across all historic categories.

This approach preserves the distribution and diversity of bridge ages without disproportionately removing entries based on historic classification, which may introduce unnecessary bias into the analysis.

In [8]:
# Excluding bridges over the 90th percentile age
bridges_filtered = bridges[bridges["Age"] <= percentile_90]

1.3 Reducing the number of categories for Materials and Design¶

The categorical variables Material and Design will be used in the analysis. However, I needed to investigate if there are categories within these variables that contain a small number of bridges. Below are bar plots of these two variables.

In [9]:
# Plotting bar plots for "Design" and "Material"
fig, axes = plt.subplots(1, 2, figsize=(12.5, 5))
colors = sns.color_palette("muted")

# Bar plot for Material
bridges_filtered["Material"].value_counts().plot(kind="bar", ax=axes[0], color=colors[2], alpha=0.8)
axes[0].set_xlabel("Material", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[0].set_ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[0].tick_params(axis="x", rotation=0, labelsize=10)
axes[0].yaxis.set_minor_locator(plt.MultipleLocator(1000))
axes[0].grid(which="minor", axis="y", linestyle=":", linewidth=0.5)

# Bar plot for Design
bridges_filtered["Design"].value_counts().plot(kind="bar", ax=axes[1], color=colors[4], alpha=0.8)
axes[1].set_xlabel("Design", fontsize = 12, fontweight = "bold", labelpad = -3)
axes[1].set_ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 5)
axes[1].tick_params(axis="x", rotation=30, labelsize=10)
axes[1].yaxis.set_minor_locator(plt.MultipleLocator(1000))
axes[1].grid(which="minor", axis="y", linestyle=":", linewidth=0.5)

plt.text(-11.6, 31000, 
         "Bar plots of categorical variables", 
         size = 14, weight = "bold", color = "black")
plt.text(-11.6, 29200, 
         "Bridge material and design", 
         size = 13, color = "black")
plt.text(-11.6, -6900, 
         "RASHAD MALIK" + " " * 140 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: "Bar plots of categorical variables: bridge material and design"]

I can see some categories in both variables with a very small number of bridges relative to the dominant categories. This can cause issues for the analysis for a number of reasons:

  • Categories with a small number of bridges can introduce noise and variability, making it harder for statistical models to find stable patterns.
  • More categories can make it harder to interpret model results and understand the influence of each group.

To help avoid these issues, I used the following approach to reduce the number of categories:

  • For Material, concrete and steel bridges have much larger counts compared to the rest, so I collapsed the remaining materials into the "Other" category.
  • For Design, beam is by far the most common. Slab is much less common, but still considerably more frequent than the rest. As before, I collapsed the remaining designs into the "Other" category.
In [10]:
# Assigning "Other" to non-matching values in "Material" and "Design"
bridges_filtered = bridges_filtered.copy()
bridges_filtered.loc[~bridges_filtered["Material"].isin(["Concrete", "Steel"]), "Material"] = "Other"
bridges_filtered.loc[~bridges_filtered["Design"].isin(["Beam", "Slab"]), "Design"] = "Other"

# Removing unused categories in both "Material" and "Design"
bridges_filtered["Material"] = bridges_filtered["Material"].cat.remove_unused_categories()
bridges_filtered["Design"] = bridges_filtered["Design"].cat.remove_unused_categories()

1.4 Deriving the current bridge condition¶

In the analysis, I used three variables to describe the condition of the bridges:

  • Deck_rating: The condition of the deck of the bridge.
  • Superstr_rating: The condition of the bridge superstructure.
  • Substr_rating: The condition of the bridge substructure (foundations).

For regression analysis, target variables are typically continuous, however, these ratings are ordinal. I needed to turn these ratings into something continuous so that I could conduct the regression analysis later in this project.

One approach is turning these ratings into "scores". I created three new variables: Deck_score, Superstr_score, and Substr_score. I used the following mapping for these columns:

| Category | Value |
|---|---|
| Failed | 0 |
| Failing | 1 |
| Critical | 2 |
| Serious | 3 |
| Poor | 4 |
| Fair | 5 |
| Satisfactory | 6 |
| Good | 7 |
| Very Good | 8 |
| Excellent | 9 |
In [11]:
# Defining the mapping
rating_to_score = {
    "Failed": 0,
    "Failing": 1,
    "Critical": 2,
    "Serious": 3,
    "Poor": 4,
    "Fair": 5,
    "Satisfactory": 6,
    "Good": 7,
    "Very Good": 8,
    "Excellent": 9
}

# Applying the mapping and creating new "score" columns
bridges_filtered["Deck_score"] = bridges_filtered["Deck_rating"].map(rating_to_score)
bridges_filtered["Superstr_score"] = bridges_filtered["Superstr_rating"].map(rating_to_score)
bridges_filtered["Substr_score"] = bridges_filtered["Substr_rating"].map(rating_to_score)

I also created a fourth column, called Score, which sums the three scores and gives the target variable that I used to assess the condition of all bridges moving forwards.

In [12]:
# Calculating total score
bridges_filtered["Score"] = (bridges_filtered["Deck_score"] +
                                          bridges_filtered["Superstr_score"] +
                                          bridges_filtered["Substr_score"]
                                         )

Before continuing: as noted in the preliminary exploration at the start of the project, some of these variables have missing values. Let's investigate:

In [13]:
# Checking for NaN values in the three scores
print("NaN in Deck_score:", bridges_filtered["Deck_score"].isna().sum())
print("NaN in Superstr_score:", bridges_filtered["Superstr_score"].isna().sum())
print("NaN in Substr_score:", bridges_filtered["Substr_score"].isna().sum())
NaN in Deck_score: 5
NaN in Superstr_score: 1
NaN in Substr_score: 0

There are 5 bridges with missing deck scores, and 1 with a missing superstructure score. If nothing is done about these missing values, those 6 bridges would have misleadingly low total scores.

To account for these missing values, I took the average of the two available scores for each affected bridge and added it to their sum (equivalent to scaling the two-score sum by 3/2). This scales the six bridges' scores appropriately while still maintaining a representative view of their condition.

In [14]:
# Update Score for rows with a missing Deck or Superstr score
bridges_filtered.loc[bridges_filtered["Deck_score"].isna(), "Score"] = (((bridges_filtered["Superstr_score"] +
                                                                               bridges_filtered["Substr_score"]) / 2) +
                                                                              bridges_filtered["Superstr_score"] + 
                                                                              bridges_filtered["Substr_score"]
                                                                             )
bridges_filtered.loc[bridges_filtered["Superstr_score"].isna(), "Score"] = (((bridges_filtered["Deck_score"] +
                                                                                    bridges_filtered["Substr_score"]) / 2) +
                                                                                  bridges_filtered["Deck_score"] +
                                                                                  bridges_filtered["Substr_score"]
                                                                                 )
# Filter rows where either Deck_score or Superstr_score is NaN
filtered_na = bridges_filtered[bridges_filtered["Deck_score"].isna() | bridges_filtered["Superstr_score"].isna()]
filtered_na[["Deck_score","Superstr_score","Substr_score","Score"]]
Out[14]:
                 Deck_score  Superstr_score  Substr_score  Score
Structure_id
010920004518118         NaN             7.0           7.0   21.0
031690AA0273001         NaN             8.0           8.0   24.0
121020B37610001         NaN             7.0           7.0   21.0
131580AA0323001         NaN             8.0           7.0   22.5
190190102001006         NaN             7.0           7.0   21.0
211090AA0348002         1.0             NaN           4.0    7.5

I can see that the logic I defined is working for the six bridges, and they now have scores scaled up accordingly. Let's view a histogram of the new Score column:

In [15]:
# Plot histogram of Score
plt.figure(figsize=(7, 4))
plt.hist(bridges_filtered["Score"], bins=27, alpha=0.8)
plt.xlabel("Score", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(500))
plt.grid(which="minor", axis="y", linestyle=":", linewidth=0.7)
plt.gca().xaxis.set_minor_locator(plt.MultipleLocator(1))
plt.grid(which="minor", axis="x", linestyle=":", linewidth=0.7)

plt.text(-5.3, 9820, 
         "Histogram of Score (bins=27)", 
         size = 14, weight = "bold", color = "black")
plt.text(-5.3, 9120, 
         "Distribution of bridge condition scores", 
         size = 13, color = "black")
plt.text(-5.3, -2600, 
         "RASHAD MALIK" + " " * 42 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: "Histogram of Score (bins=27): distribution of bridge condition scores"]

The distribution is highly concentrated around higher values, with most scores ranging between 15 and 25.

  • The highest frequency appears around scores of 19 to 21, showing that many bridges are in relatively decent condition.
  • Only a small number of bridges have low scores, with very few below a score of 10, which indicates that there are few bridges in poor condition.
  • Given that the maximum possible score is 27, the distribution shows that the majority of bridges are in satisfactory to good condition overall.
  • The right tail tapers off as scores approach the maximum, meaning that there are fewer bridges in near-perfect condition.

Part 2: Exploratory analysis¶

In order to conduct a successful regression analysis, I needed to explore the relationships between the predictors and the target variable.

What I needed to watch out for is collinearity (when predictor variables are highly correlated), as this would impact the quality of the analysis. I also needed to explore continuous and categorical variables separately, as each requires a different approach for detecting signs of collinearity.

2.1 Continuous variables¶

The continuous variables I used in the analysis are the following:

  • Age: The age of the bridge, in years.
  • AverageDaily: The average daily recorded bridge traffic, in number of vehicles.
  • Trucks_percent: The proportion of traffic made up of trucks (lorries), as a percentage.

I began by plotting a scatter matrix, to visualise the distribution of these variables alongside the condition score.

In [16]:
# Creating a scatter matrix
scatter_matrix = pd.plotting.scatter_matrix(
    bridges_filtered[["Age", "AverageDaily", "Trucks_percent", "Score"]],
    figsize=(12, 12),
    marker="o",
    alpha=0.3,
    diagonal="hist",
)

for ax in scatter_matrix.ravel():
    ax.grid(False)
    ax.set_xlabel(ax.get_xlabel(), fontsize=12, rotation=0, labelpad = 8, fontweight = "bold")
    ax.set_ylabel(ax.get_ylabel(), fontsize=12, rotation=90, labelpad = 8, fontweight = "bold")
    ax.xaxis.label.set_size(14)
    ax.yaxis.label.set_size(14)
    ax.tick_params(axis='x', rotation=0)

plt.text(-95, 78000, 
         "Scatter matrix of continuous variables", 
         size = 14, weight = "bold", color = "black")
plt.text(-95, 76000, 
         "Comparing Score, Age, AverageDaily, and Trucks_percent", 
         size = 13, color = "black")
plt.text(-95, -6300, 
         "RASHAD MALIK" + " " * 128 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: "Scatter matrix of continuous variables: comparing Score, Age, AverageDaily, and Trucks_percent"]

Looking at the scatter matrix, it is difficult to discern clear signs of collinearity between the variables, or any clear patterns against the score. Therefore, I turned to a mathematical approach.

I computed the correlation coefficients (also known as the $r$ values) between each variable, to help understand their relationship:

In [17]:
# Creating the heatmap with correlation coefficients
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
sns_heatmap = sns.heatmap(
    bridges_filtered[["Age", "AverageDaily", "Trucks_percent", "Score"]].corr(numeric_only=True),
    vmin=-1, vmax=1, 
    cmap="coolwarm",
    annot=True, 
    fmt=".4f",
    annot_kws={"size": 12, "color": "black"},
    linewidths=0.1,
    linecolor='grey', 
    ax=ax
)

plt.yticks(rotation=0, fontsize=9, fontweight = "bold")
plt.xticks(rotation=0, fontsize=9, fontweight = "bold")
colorbar = sns_heatmap.collections[0].colorbar
colorbar.ax.tick_params(labelsize=9, rotation=0)

plt.text(-1.1, -0.5, 
         "Heatmap with correlation coefficients", 
         size = 14, weight = "bold", color = "black")
plt.text(-1.1, -0.22, 
         "Comparing continuous variables", 
         size = 13, color = "black")
plt.text(-1.1, 4.8, 
         "RASHAD MALIK" + " " * 30 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: "Heatmap with correlation coefficients: comparing continuous variables"]

The heatmap shows the correlation coefficients between the variables Age, AverageDaily, Trucks_percent, and Score. These correlations help understand the relationships between the variables and identify any signs of collinearity.

  • There is a moderately strong negative correlation between Age and Score (correlation coefficient of -0.58). This tells us that as the age of a bridge increases, the condition of the bridge tends to decrease, indicating older bridges generally have lower condition scores.
  • The correlation between Age and Trucks_percent is 0.23, which is a weak positive relationship. This tells us that older bridges tend to have a slightly higher proportion of truck traffic, but the relationship is not strong.
  • AverageDaily has very low correlation coefficients with the other variables. This means that the average daily traffic across the bridges seems to have little or nothing to do with the bridge age, truck percentage, or condition score.
  • The correlation between Trucks_percent and Score is -0.042, which is very close to zero. This suggests there is essentially no linear relationship between the percentage of trucks using a bridge and the bridge's overall condition.

To summarise these observations, the only notable relationship is between Age and Score, where older bridges tend to have lower condition scores. However, there are no significant correlations among the predictor variables themselves, so collinearity is not a concern in this dataset. This suggests that all the continuous predictors can be included in the regression model without the risk of collinearity affecting model stability or quality.
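The heatmap only inspects pairwise correlations. As a supplementary check (not part of the original workflow), the same conclusion can be verified with variance inflation factors (VIFs), which also catch collinearity involving combinations of predictors. The sketch below computes VIFs from scratch with numpy on synthetic data; values near 1 mean a predictor is not explained by the others, while values above roughly 5–10 are a common warning sign:

```python
import numpy as np

def vif(X):
    """Variance inflation factor for each column of X (shape: n_samples, n_features).

    VIF_j = 1 / (1 - R^2_j), where R^2_j comes from regressing column j
    on the remaining columns (with an intercept).
    """
    X = np.asarray(X, dtype=float)
    n, p = X.shape
    factors = []
    for j in range(p):
        y = X[:, j]
        # Design matrix: intercept plus all predictors except column j
        A = np.column_stack([np.ones(n), np.delete(X, j, axis=1)])
        coef, *_ = np.linalg.lstsq(A, y, rcond=None)
        ss_res = ((y - A @ coef) ** 2).sum()
        ss_tot = ((y - y.mean()) ** 2).sum()
        factors.append(ss_tot / ss_res)  # equals 1 / (1 - R^2_j)
    return factors

# Synthetic demo: three independent predictors -> VIFs close to 1
rng = np.random.default_rng(42)
X = rng.normal(size=(500, 3))
print([round(v, 2) for v in vif(X)])
```

On the bridge data, the function would be applied to `bridges_filtered[["Age", "AverageDaily", "Trucks_percent"]].to_numpy()`.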

Since Age has the only notable relationship with the score, I visualised this variable differently from the scatter matrix shown earlier: I calculated and plotted the average score by age.

In [18]:
# Calculate average Total Score for each Age
age_score_avg = bridges_filtered.groupby("Age")["Score"].mean()

# Create lineplot
plt.figure(figsize=(7, 4))
age_score_avg.plot(kind="line")
plt.xlabel("Age", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.ylabel("Average score", fontsize = 12, fontweight = "bold", labelpad = 10)
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(0.2))
plt.grid(which="minor", axis="y", linestyle=":", linewidth=0.7)
plt.gca().xaxis.set_minor_locator(plt.MultipleLocator(2))
plt.grid(which="minor", axis="x", linestyle=":", linewidth=0.7)

plt.text(-7, 25, 
         "Line-plot", 
         size = 14, weight = "bold", color = "black")
plt.text(-7, 24.5, 
         "Average bridge score by age", 
         size = 13, color = "black")
plt.text(-7, 16.35, 
         "RASHAD MALIK" + " " * 40 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: "Average bridge score by age" (line plot)]

This provides a clearer visual confirmation of the negative relationship between the bridge age and the bridge condition score.

2.2 Categorical variables¶

The categorical variables I used in the analysis are the following:

| Variable | Description | Categories |
|---|---|---|
| Material | The dominant material the bridge is made from. | Concrete; Steel; Other (timber, masonry, or other materials) |
| Design | The design of the bridge. | Beam; Slab; Other (arch, frame, truss, movable, suspension, or other bridge designs) |

I began by creating violin plots, to visualise the distribution of these variables alongside the condition score.

In [19]:
# Set up the figure and subplots
fig, axes = plt.subplots(1, 2, figsize=(13, 5), sharey=False)

material_order = ["Concrete", "Steel", "Other"]
design_order = ["Beam", "Slab", "Other"]

# Violin plot for Total Score by Material
sns.violinplot(data=bridges_filtered, x="Material", y="Score", ax=axes[0], alpha=0.8, cut=0, order=material_order, color=colors[4])
axes[0].set_xlabel("Material", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[0].set_ylabel("Score", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[0].tick_params(axis="x", rotation=0, labelsize=10)
axes[0].yaxis.set_minor_locator(plt.MultipleLocator(1))
axes[0].grid(which="minor", axis="y", linestyle=":", linewidth=0.5)
axes[0].set_ylim(bottom=0)

# Violin plot for Total Score by Design
sns.violinplot(data=bridges_filtered, x="Design", y="Score", ax=axes[1], alpha=0.8, cut=0, order=design_order, color=colors[2])
axes[1].set_xlabel("Design", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[1].set_ylabel("Score", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[1].tick_params(axis="x", rotation=0, labelsize=10)
axes[1].yaxis.set_minor_locator(plt.MultipleLocator(1))
axes[1].grid(which="minor", axis="y", linestyle=":", linewidth=0.5)
axes[1].set_ylim(bottom=0)

plt.text(-4.46, 31.3, 
         "Violin plots of material and design", 
         size = 14, weight = "bold", color = "black")
plt.text(-4.46, 29.5, 
         "Visualising the distribution of categorical variables against the condition score", 
         size = 13, color = "black")
plt.text(-4.46, -6, 
         "RASHAD MALIK" + " " * 140 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: violin plots of condition score by material and by design]

From the violin plots, almost all the categories exhibit a wide range of condition scores, indicating considerable variability in bridge condition within each category. The "Other" design category, however, has a more concentrated distribution, indicating more consistent condition scores.

The violin plots compare the categories against the total score, but not against each other. To test for collinearity, I used the chi-square test of independence. The chi-square test checks whether there is a significant association between two categorical variables, which can indicate potential collinearity.

  • Null hypothesis $(H_0)$: There is no association between Material and Design. This means the two variables are independent of each other.
  • Alternative hypothesis $(H_1)$: There is an association between Material and Design, indicating that the two variables are not independent.

The significance level ($\alpha$) is typically set at 0.05. If the p-value is less than 0.05, I can reject the null hypothesis and conclude that there is a statistically significant association between Material and Design.

In [20]:
# Contingency table of the two categorical variables
contingency_table = pd.crosstab(bridges_filtered["Material"], bridges_filtered["Design"])

# Chi-square test calculation
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-square statistic: {chi2_stat:.4g}")
print(f"p-value: {p_value:.3g}")
Chi-square statistic: 898.1
p-value: 4.29e-193

Looking at the results, the following can be deduced:

  • A high chi-square statistic (like 898.10) shows a large discrepancy between observed and expected frequencies, suggesting that Material and Design may be associated.
  • The p-value is extremely small (essentially zero), far below the significance level of 0.05. This very low p-value indicates strong evidence against the null hypothesis, suggesting a statistically significant association between Material and Design. In other words, it's very unlikely that the observed association between Material and Design occurred by chance.

Since the p-value is far below the significance level of 0.05, the null hypothesis is rejected. This result suggests a statistically significant association between Material and Design, indicating that these two variables are not independent.

The significant association implies potential collinearity between Material and Design. This means that including both variables in the regression model could lead to issues, as one variable may partially predict the other.

I still included both variables in the regression analysis; however, for future studies I would recommend either using only one of the two, or applying dimensionality reduction techniques, to help mitigate these potential collinearity issues.
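The chi-square statistic itself grows with sample size, so it says little about how strong the association is. One way to quantify strength on a fixed scale is Cramér's V, which rescales the statistic to the range [0, 1]. A minimal sketch, using a made-up contingency table rather than the real bridge counts:

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(table):
    """Cramér's V: association strength between two categorical variables (0 = none, 1 = perfect)."""
    chi2, _, _, _ = chi2_contingency(table)
    n = table.to_numpy().sum()
    r, k = table.shape
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))

# Hypothetical Material x Design counts (illustrative only, not the real data)
toy = pd.DataFrame([[120, 30, 10], [40, 80, 20], [15, 10, 25]],
                   index=["Concrete", "Steel", "Other"],
                   columns=["Beam", "Slab", "Other"])
print(f"Cramér's V: {cramers_v(toy):.3f}")
```

Applied to the real contingency table, a V well above zero would confirm that the association is not only statistically significant but also substantively strong.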

2.3 Preliminary conclusions¶

From this initial exploration of the predictor and target variables, a few early conclusions can be drawn:

  • There is a notable negative relationship between the age and condition score of the bridges. Older bridges tend to be in worse conditions.
  • There are no significant correlations among the age, daily average traffic, and truck percentage variables.
  • There is a statistically significant association between bridge material and bridge design, which may impact the reliability of the upcoming regression analysis.

Part 3: Regression modelling¶

3.1 Preparing variables for regression¶

Before starting the regression calculations, it is important to check the distributions of the continuous variables.

In [21]:
# Create a figure with 2x2 subplots
fig, axes = plt.subplots(2, 2, figsize=(12.5, 8))

# Plot histogram for "Score"
axes[0, 0].hist(bridges_filtered["Score"], bins=20, color=colors[0])
axes[0, 0].set_xlabel("Score", fontsize = 12, fontweight = "bold", labelpad = 5)
axes[0, 0].set_ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)

# Plot histogram for "Age"
axes[0, 1].hist(bridges_filtered["Age"], bins=20, color=colors[1])
axes[0, 1].set_xlabel("Age", fontsize = 12, fontweight = "bold", labelpad = 5)
axes[0, 1].set_ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)

# Plot histogram for "AverageDaily"
axes[1, 0].hist(bridges_filtered["AverageDaily"], bins=20, color=colors[2])
axes[1, 0].set_xlabel("Average daily traffic", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[1, 0].set_ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)

# Plot histogram for "Trucks_percent"
axes[1, 1].hist(bridges_filtered["Trucks_percent"], bins=20, color=colors[3])
axes[1, 1].set_xlabel("Truck percentage", fontsize = 12, fontweight = "bold", labelpad = 10)
axes[1, 1].set_ylabel("Frequency", fontsize = 12, fontweight = "bold", labelpad = 10)

plt.text(-155, 35000, 
         "Histograms (bins=20)", 
         size = 14, weight = "bold", color = "black")
plt.text(-155, 33700, 
         "Visualising the distribution of continuous variables", 
         size = 13, color = "black")
plt.text(-155, -5000, 
         "RASHAD MALIK" + " " * 140 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()
[Figure: histograms of Score, Age, AverageDaily, and Trucks_percent]

When predictors or the target variable are highly skewed, it can lead to problems with interpretability, model reliability, and predictive performance.

From the above histograms, the predictors AverageDaily and Trucks_percent are heavily skewed to the right.

To address the issues that may arise from skewed variables, it's often helpful to transform them (e.g. using logarithmic transformations) before fitting the regression model. This can help improve the model's performance.
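The effect of such a transformation can be illustrated on synthetic data; the lognormal parameters below are arbitrary, chosen only to mimic a heavily right-skewed traffic count:

```python
import numpy as np

rng = np.random.default_rng(0)
skewed = rng.lognormal(mean=8, sigma=1.5, size=1000)  # synthetic right-skewed "traffic"

def skewness(x):
    """Sample skewness: 0 for a symmetric distribution, > 0 for a long right tail."""
    x = np.asarray(x, dtype=float)
    return np.mean((x - x.mean()) ** 3) / x.std() ** 3

print(f"raw skewness:   {skewness(skewed):.2f}")
print(f"log1p skewness: {skewness(np.log1p(skewed)):.2f}")
```

`np.log1p` computes log(x + 1), the same transformation applied to the model's predictors below; it pulls the long right tail back towards a roughly symmetric distribution.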

I ran two regression models and compared the findings:

  • Model 1: Regression without transforming the skewed variables.
  • Model 2: Regression using a logarithmic transformation on AverageDaily and Trucks_percent.

Next, I prepared the categorical variables for regression by creating "dummy variables" for the Material and Design variables.

  • I dropped the most dominant type of material (concrete) to use it as the reference material that will be compared against.
  • Similarly, I dropped the most dominant bridge design (beam) to use it as the reference design that will be compared against.
In [22]:
# Creating dummy variables for Material and Design
material_d = pd.get_dummies(bridges_filtered.Material, drop_first=True)
design_d = pd.get_dummies(bridges_filtered.Design, drop_first=True)
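Note that drop_first=True drops the alphabetically first level, not the most frequent one; here the two happen to coincide, since "Beam" and "Concrete" sort first and are also the most common categories. A toy illustration:

```python
import pandas as pd

# Hypothetical material labels (not the real data)
s = pd.Series(["Concrete", "Steel", "Other", "Concrete", "Steel"])
dummies = pd.get_dummies(s, drop_first=True)
print(list(dummies.columns))  # "Concrete" (alphabetically first) becomes the implicit reference
```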

3.2 Calculating regression coefficients¶

Now I calculated the regression coefficients (the $\beta$ values), which help estimate the effect the predictor variables have on the target variable.

Linear regression formula: $$ y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 + \beta_4 X_4 + \beta_5 X_5 + \beta_6 X_6 + \beta_7 X_7 $$

In the above equation, $y$ represents the target variable Score. The $X$ values represent the seven predictors:

  • The three continuous variables Age, AverageDaily, and Trucks_percent.
  • The four dummy variables created from the categorical variables (one for each non-reference category): Material-Steel, Material-Other, Design-Slab, and Design-Other.

The $\beta$ values represent the regression coefficients, which indicate how strongly each variable influences the target variable.

First, I set up the variables for the regression analysis, for both Model 1 and Model 2, and ran the calculations:

In [23]:
# y vector: Vector of target values (Score) used in both model 1 and 2
y = bridges_filtered.Score

# Design Matrix X: Matrix of predictor values for model 1
X = np.column_stack((bridges_filtered.Age, bridges_filtered.AverageDaily, bridges_filtered.Trucks_percent,
                      material_d.Steel, material_d.Other,
                      design_d.Slab, design_d.Other))

# Design Matrix X_log: Matrix of predictor values with logarithmic transformations of skewed variables for model 2
X_log = X.copy()
X_log[:, 1] = np.log(X_log[:, 1] + 1)  # Transforming "AverageDaily"
X_log[:, 2] = np.log(X_log[:, 2] + 1)  # Transforming "Trucks_percent"

# Running the regression for model 1 and model 2
reg_model_1 = LinearRegression().fit(X, y)
reg_model_2 = LinearRegression().fit(X_log, y)

Next, I compared key metrics from both models, to determine which model should be used going forwards.

I looked at:

  • The $R^2$ values: Also known as the coefficient of determination, used to assess the goodness of fit of the regression models. It indicates how well the model explains the variability in the target variable. Therefore, it is a useful metric for comparing the two models.
  • Root Mean Squared Error (RMSE): The RMSE allows evaluation and comparison of the accuracy of the regression models. It measures the average magnitude of the errors between predicted values and actual values, giving an indication of how well the models fit the data.
  • Coefficients ($\beta$ values): I considered how the logarithmic transformations affect the interpretation of the coefficients for AverageDaily and Trucks_percent.
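As a reminder of what RMSE measures, it can be computed directly from its definition, $\sqrt{\frac{1}{n}\sum_i(y_i - \hat{y}_i)^2}$, and checked against scikit-learn; the numbers below are toy values, not the bridge data:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([20.0, 18.5, 22.0, 19.0])
y_pred = np.array([19.0, 19.5, 21.0, 20.5])

# RMSE by hand: square the errors, average them, take the square root
rmse_manual = np.sqrt(np.mean((y_true - y_pred) ** 2))
rmse_sklearn = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"RMSE: {rmse_manual:.4f}")
```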
In [24]:
# Unpacking the fitted regression coefficients
(beta_Age1, beta_AverageDaily1, beta_Trucks_percent1,
 beta_material_steel1, beta_material_other1,
 beta_design_slab1, beta_design_other1) = reg_model_1.coef_

(beta_Age2, beta_AverageDaily2, beta_Trucks_percent2,
 beta_material_steel2, beta_material_other2,
 beta_design_slab2, beta_design_other2) = reg_model_2.coef_

# Predictions and RMSE calculations for model 1 and model 2
y_pred_model_1 = reg_model_1.predict(X)
y_pred_model_2 = reg_model_2.predict(X_log)
rmse_model_1 = root_mean_squared_error(y, y_pred_model_1)
rmse_model_2 = root_mean_squared_error(y, y_pred_model_2)

# Printing relevant metrics for model comparison
print("Metrics for Model 1:")
print('Model 1: The R2 coefficient of determination is %4.3f' % reg_model_1.score(X, y))
print("Model 1: RMSE is %4.4f" % rmse_model_1)
print('Model 1: Estimated coefficient for average daily use is %4.11f' % (beta_AverageDaily1), 
      "change of total score per car.")
print('Model 1: Estimated coefficient for truck percentage is %4.5f' % beta_Trucks_percent1, 
      'change of total score per percent.')

print("\nMetrics for Model 2:")
print('Model 2: The R2 coefficient of determination is %4.3f' % reg_model_2.score(X_log, y))
print("Model 2: RMSE is %4.4f" % rmse_model_2)
print('Model 2: Estimated coefficient for average daily use is %4.4f' % beta_AverageDaily2, 
      'log units.')
print('Model 2: Estimated coefficient for truck percentage is %4.4f' % beta_Trucks_percent2, 
      'log units.')
Metrics for Model 1:
Model 1: The R2 coefficient of determination is 0.459
Model 1: RMSE is 1.4186
Model 1: Estimated coefficient for average daily use is 0.00000000469 change of total score per car.
Model 1: Estimated coefficient for truck percentage is 0.00568 change of total score per percent.

Metrics for Model 2:
Model 2: The R2 coefficient of determination is 0.461
Model 2: RMSE is 1.4160
Model 2: Estimated coefficient for average daily use is 0.0133 log units.
Model 2: Estimated coefficient for truck percentage is 0.0765 log units.

Based on the metrics provided for Model 1 and Model 2, here are some key points to consider when deciding between the two models:

Comparison of $R^2$ and RMSE values

  • Model 1: $R^2 = 0.459$, RMSE = 1.4186
  • Model 2: $R^2 = 0.461$, RMSE = 1.4160
  • The $R^2$ values are very similar, with Model 2 showing only a slight improvement: it explains marginally more of the variance in Score, but the difference is too small to be meaningful on its own and does not provide a strong reason to favour one model over the other.
  • The RMSE values are also very close: 1.4186 for Model 1 versus 1.4160 for Model 2. Model 2's slightly lower RMSE indicates a marginally better fit; its log transformations may be capturing non-linear relationships that the untransformed predictors in Model 1 miss.

Interpretability of coefficients

  • The estimated coefficients in Model 1 are very small, while the transformed values in Model 2 yield coefficients that are easier to interpret and suggest a more meaningful relationship between Score and the traffic variables (AverageDaily and Trucks_percent). This supports Model 2 as the preferred model, as the transformations seem to better capture the relationship between these predictors and the target variable.

Model 2 is the better choice because it slightly improves the $R^2$ value, has a marginally better RMSE value, and makes the coefficients for AverageDaily and Trucks_percent more interpretable. The log transformation helps capture meaningful effects, making Model 2 more suitable for both practical interpretation and predictive purposes, therefore Model 2 will be used moving forward.
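A practical way to read Model 2's log coefficients: for predictor values well above 1 (where $\log(x+1) \approx \log(x)$), doubling the predictor changes the predicted score by roughly $\beta \ln 2$. A quick check using the coefficients printed above:

```python
import numpy as np

# Model 2 coefficients, copied from the output above
beta_avg_daily = 0.0133
beta_trucks = 0.0765

# Approximate change in predicted Score when a log-transformed predictor doubles
print(f"doubling AverageDaily:   {beta_avg_daily * np.log(2):+.4f} points")
print(f"doubling Trucks_percent: {beta_trucks * np.log(2):+.4f} points")
```

Both effects are small in absolute terms, consistent with the later finding that the traffic variables matter far less than age.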

The model 2 regression can be represented by the following formula:

$$ y = 22.8 + \beta_1\,\text{Age} + \beta_2\log(\text{AverageDaily} + 1) + \beta_3\log(\text{TrucksPercent} + 1) + \beta_4\,\text{Material[Steel]} + \beta_5\,\text{Material[Other]} + \beta_6\,\text{Design[Slab]} + \beta_7\,\text{Design[Other]} $$

Where:

  • $R^2$ value: $0.461$
  • Intercept: $22.8$
  • Coefficients:
    • $\beta_1 = -0.0593$
    • $\beta_2 = 0.0133$
    • $\beta_3 = 0.0765$
    • $\beta_4 = -1.39$
    • $\beta_5 = -2.78$
    • $\beta_6 = 0.0532$
    • $\beta_7 = 0.103$

3.3 Comparing regression coefficients¶

Below I compared the coefficients. For the continuous variables, I calculated the effect of each coefficient across its 10th-to-90th percentile range, which allows the continuous variables to be compared directly without their different scales affecting interpretation.

In [25]:
# Calculating ranges with log transformation where applicable
age_range = bridges_filtered.Age.quantile(0.9) - bridges_filtered.Age.quantile(0.1)
use_range = np.log(bridges_filtered.AverageDaily.quantile(0.9) + 1) - np.log(bridges_filtered.AverageDaily.quantile(0.1) + 1)
trucks_range = np.log(bridges_filtered.Trucks_percent.quantile(0.9) + 1) - np.log(bridges_filtered.Trucks_percent.quantile(0.1) + 1)
score_range = bridges_filtered.Score.quantile(0.9) - bridges_filtered.Score.quantile(0.1)

print("Change to the total score caused by continuous variables over 10th to 90th quantiles:")
print ('Age: %4.1f percent'
       % (100 * (beta_Age2 * age_range) / score_range))

print ('Average daily usage: %4.2f percent'
       % (100 * (beta_AverageDaily2 * use_range) / score_range))

print ('Truck percentage: %4.2f percent'
       % (100 * (beta_Trucks_percent2 * trucks_range) / score_range))

print("\nChange to the total score caused by categorical variables:")
print('Steel bridges: %4.2f' % beta_material_steel2, 
      'change of total score compared to bridges made of the reference material (concrete).')
print('Bridges made of "Other" materials: %4.2f' % beta_material_other2, 
      'change of total score compared to the reference material (concrete).')
print('Slab designed bridges: %4.4f' % beta_design_slab2, 
      'change of total score compared to the reference design (beam).')
print('Bridges of "Other" designs: %4.3f' % beta_design_other2, 
      'change of total score compared to the reference design (beam).')
Change to the total score caused by continuous variables over 10th to 90th quantiles:
Age: -61.7 percent
Average daily usage: 1.66 percent
Truck percentage: 4.80 percent

Change to the total score caused by categorical variables:
Steel bridges: -1.39 change of total score compared to bridges made of the reference material (concrete).
Bridges made of "Other" materials: -2.78 change of total score compared to the reference material (concrete).
Slab designed bridges: 0.0532 change of total score compared to the reference design (beam).
Bridges of "Other" designs: 0.103 change of total score compared to the reference design (beam).

Among the continuous variables, Age showed the largest impact: moving across its 10th-to-90th percentile range changes the predicted Score by -61.7% of the Score's own percentile range, indicating that older bridges generally have lower condition scores. Average daily usage and truck percentage had much smaller effects (+1.66% and +4.80% respectively), indicating that the traffic variables have a relatively minor impact compared to age.

For categorical variables, steel bridges showed a decrease in Score by 1.39 points, and bridges made of "Other" materials showed a decrease by 2.78 points, both compared to concrete bridges (the reference material). Design differences had minimal effects, with slab designs contributing a 0.0532 point increase and Other designs a 0.103 point increase compared to beam designs.

The regression analysis highlights age as the most significant factor affecting bridge condition, with smaller but notable impacts from material types, and minor impacts from the traffic variables.

3.4 Distribution of residuals¶

It is important to analyse residuals, as it helps assess how well the model fits the data and whether the model assumptions are valid.

In [26]:
# Calculating y_hat
y_hat = reg_model_2.predict(X_log)

# Plotting residuals histogram
fig, a1 = plt.subplots(1, 1)
residuals = y_hat - y
a1.hist(residuals, bins=50, density=True)
_ = a1.set_xlabel('Error in prediction (predicted - actual total score)', fontweight = "bold", labelpad = 10)
plt.ylabel("Density", fontsize = 12, fontweight = "bold", labelpad = 10)

plt.text(-12.6, 0.38, 
         "Distribution of residuals (bins=50)", 
         size = 14, weight = "bold", color = "black")
plt.text(-12.6, 0.355, 
         "For regression model 2", 
         size = 13, color = "black")
plt.text(-12.6, -0.075, 
         "RASHAD MALIK" + " " * 32 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

plt.show()

# Calculating RMSE value
rmse = root_mean_squared_error(y, y_hat)
print("Root of the mean squared error: %.2f" % rmse)
[Figure: histogram of residuals for regression model 2]
Root of the mean squared error: 1.42

From looking at the distribution of residuals, the following can be noted:

  • The residuals are approximately centred around zero. Typically, in a well-fitted regression model, residuals should ideally have a mean close to zero, indicating that the model doesn't systematically over-predict or under-predict.

  • The histogram shows a distribution that is roughly bell-shaped, with the highest concentration of residuals clustered near zero and fewer residuals as we move away from zero. This suggests that the errors in the predictions are relatively small for most of the data points.

However, there are a few extreme positive residuals (above 10); since the residuals here are defined as predicted minus actual, this indicates that the model sometimes substantially over-predicts the Score.

  • Given the typical range of Score, a root of the mean squared error (RMSE) of 1.42 seems reasonably low, implying that the model performs fairly well.
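Beyond eyeballing the histogram, symmetry and tail weight can be checked numerically with scipy.stats; the residuals below are synthetic stand-ins (normal noise with the model's RMSE as standard deviation), not the actual residuals:

```python
import numpy as np
from scipy.stats import skew, kurtosis

rng = np.random.default_rng(42)
residuals = rng.normal(0, 1.42, 5000)  # synthetic stand-in for the model's residuals

print(f"mean:            {residuals.mean():+.3f}")    # ~0 for an unbiased model
print(f"skewness:        {skew(residuals):+.3f}")     # ~0 if symmetric
print(f"excess kurtosis: {kurtosis(residuals):+.3f}")  # ~0 for normal-weight tails
```

Run on the real residuals, a clearly positive skewness would confirm the long right tail of over-predictions visible in the histogram.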

3.5 Plotting predicted scores against actual scores¶

The scatter plot of predicted vs. actual values is a valuable diagnostic tool. It helps assess model fit, and detect patterns indicating model errors.

In [27]:
# Plotting outputs
fig, a = plt.subplots(1,1,figsize=(10,6))
a.scatter(y_hat, y,  color=colors[0], alpha=0.6)
a.plot(y_hat, y_hat, color='orange', linewidth=3)

plt.text(14.7, 30.8, 
         "Predicted scores against actual scores", 
         size = 14, weight = "bold", color = "black")
plt.text(14.7, 29, 
         "For regression model 2", 
         size = 13, color = "black")
plt.text(14.7, -6.5, 
         "RASHAD MALIK" + " " * 90 + "Source: Federal Highway Administration", 
         color = "#f0f0f0", 
         backgroundcolor = "#4d4d4d", 
         fontsize=12)

a.set_xlabel('Predicted score', fontsize = 12, fontweight = "bold", labelpad = 10)
a.set_ylabel('Actual score', fontsize = 12, fontweight = "bold", labelpad = 10)
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(1))
plt.grid(which="minor", axis="y", linestyle=":", linewidth=0.7)
plt.gca().xaxis.set_minor_locator(plt.MultipleLocator(0.2))
plt.grid(which="minor", axis="x", linestyle=":", linewidth=0.7)

plt.show()
[Figure: scatter plot of predicted scores against actual scores]

Looking at the above plot, we can deduce the following:

  • The model performs reasonably well for mid-range scores (around 17 to 21), but tends to under-predict higher scores and over-predict some lower scores.
  • The presence of outliers (for example, many bridges with actual scores close to 0 receive high predicted scores) shows that the model may benefit from further refinement, possibly through additional transformations, extra predictors, or a higher-order regression (e.g. quadratic terms and above).

Overall, the plot suggests that while our model has reasonable predictive power, it has room for improvement, especially for extreme values.
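One refinement mentioned above, a higher-order regression, amounts to adding powered columns to the design matrix. A sketch on synthetic data with deliberate curvature (not the bridge data); note that the in-sample $R^2$ of the nested quadratic model can never be lower than the linear one's:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
age = rng.uniform(0, 100, 500)
score = 23 - 0.06 * age - 0.0008 * age**2 + rng.normal(0, 1.4, 500)  # synthetic curvature

X_lin = age.reshape(-1, 1)
X_quad = np.column_stack((age, age**2))  # add the squared term as an extra column

r2_lin = LinearRegression().fit(X_lin, score).score(X_lin, score)
r2_quad = LinearRegression().fit(X_quad, score).score(X_quad, score)
print(f"linear R2:    {r2_lin:.3f}")
print(f"quadratic R2: {r2_quad:.3f}")
```

Whether the extra term is worth keeping should be judged on held-out data, since in-sample fit always improves as terms are added.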

Summary and conclusion¶

In this project, I successfully conducted an analysis of bridges in Texas, and explored the predictive power of specific variables on bridge conditions.

  • I prepared the dataset by generating the required variables, excluding bridges that could mislead the analysis, and deriving a system to score the condition of the bridges.
  • I explored the relationships between the five predictors and the target variable, and identified a statistically significant association between bridge material and bridge design, which may have an impact on the analysis.
  • I successfully compared two regression models, and opted for Model 2 with the logarithmically transformed variables, as it provided a more practical interpretation of the regression coefficients.
  • I analysed the regression residuals and plotted the predicted scores against the actual scores. This gave a better understanding of how well the model performs, and provided insight into how this analysis could potentially be improved in future bridge studies.

Key findings:

  • Predictive performance:
    • The model performed well for mid-range scores but degraded at the extremes, under-predicting higher scores and over-predicting some lower scores.
    • For future studies, I would recommend refining the model by introducing additional variables and addressing correlated predictors (i.e. Material and Design).
  • Variable influence:
    • Age showed the largest impact, changing the predicted score by -61.7% of the score's percentile range across its own, with smaller but notable impacts from material types, and minor impacts from the traffic variables.

In conclusion, the regression analysis provides valuable insights into the factors influencing bridge conditions, highlighting the significant impact of bridge age, while also underscoring areas for potential model refinement to improve predictive accuracy in future studies.

References¶

  1. Federal Highway Administration, U.S. Department of Transportation. Accessed October 2024. https://highways.dot.gov/
  2. National Bridge Inventory, Federal Highway Administration. Accessed October 2024. https://www.fhwa.dot.gov/
  3. Detailed Code Mapping for Individual Data Items, Federal Highway Administration. Accessed October 2024. https://www.fhwa.dot.gov/