Telco Customer Churn MCP Server¶

Author: Rashad Malik

Objective

This notebook serves as a proof of concept for implementing a Model Context Protocol (MCP) server, demonstrating how to expose data analysis and machine learning capabilities to AI assistants through a standardised interface. Using a telecommunications customer churn dataset as the domain, we prototype multiple tool types to showcase MCP's key features: tool discovery, parameter validation, and seamless integration with LLMs like Claude.

Key Steps

  • Load and clean the Telco Customer Churn dataset, preparing it as the foundation for our MCP tools
  • Prototype analytical tools including distribution plots, correlation analysis, and pandas query capabilities to demonstrate MCP's versatility
  • Train a logistic regression model as an example of exposing machine learning predictions through MCP
  • Test all tools to validate functionality before wrapping them in the MCP server interface

By prototyping these tools in a notebook environment first, we can confirm they work as required and validate our approach before deploying them in a production MCP server. This pattern is transferable to any domain where AI assistants need structured access to data and analytical capabilities.

Required libraries¶

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

1 Loading the dataset¶

The Telco Customer Churn dataset contains 7,043 customers with 21 features covering demographics (gender, senior citizen status, family details), services (phone, internet, streaming), contracts (tenure, type, billing), and charges (monthly and total).

This dataset works well for demonstrating MCP capabilities because it:

  • Addresses a real business problem (customer retention)
  • Combines categorical and numerical data types
  • Has enough complexity to be meaningful (~7,000 rows) without slowing prototyping
  • Uses dimensions that non-technical users can understand
In [2]:
# Load the Telco Customer Churn dataset from CSV file
telco_df = pd.read_csv("../data/telco_customer_churn.csv")

print(f"Dataset Loaded: {telco_df.shape[0]} rows, {telco_df.shape[1]} columns")
telco_df.head()
Dataset Loaded: 7043 rows, 21 columns
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

2 Data Cleaning¶

In this section, we perform data cleaning to ensure our MCP tools work reliably by standardising data types, removing inconsistencies, and preparing features in a format that our functions can process correctly.

Without this step, we'd encounter errors when generating visualisations, calculating correlations, or making predictions.

2.1 Convert TotalCharges to Numeric and Standardise Columns¶

The TotalCharges column is stored as a string due to some blank values. We convert it to numeric and fill missing values with 0 (these represent new customers with no charges yet). We also convert SeniorCitizen from 0/1 to 'Yes'/'No' for consistency with other categorical columns, and drop the non-predictive customerID column.

In [3]:
# Convert 'TotalCharges' to numeric, filling blanks with 0
telco_df['TotalCharges'] = pd.to_numeric(telco_df['TotalCharges'], errors='coerce').fillna(0)

# Convert 'SeniorCitizen' from 0/1 to 'Yes'/'No'
telco_df['SeniorCitizen'] = telco_df['SeniorCitizen'].map({0: 'No', 1: 'Yes'})

# Drop 'customerID' as it is not useful for modelling
telco_df = telco_df.drop(columns=['customerID'])

2.2 Simplify Service-Related Columns¶

We replace "No internet service" and "No phone service" with "No" to reduce redundant categories, since these values convey the same information.

In [4]:
# Columns with "No internet service"
internet_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                 'TechSupport', 'StreamingTV', 'StreamingMovies']

for col in internet_cols:
    telco_df[col] = telco_df[col].replace('No internet service', 'No')

# Column with "No phone service"
telco_df['MultipleLines'] = telco_df['MultipleLines'].replace('No phone service', 'No')

2.3 Verify Churn Column Values¶

We check the unique values in the target column Churn before encoding to ensure data quality.

In [5]:
# Checking 'Churn' values
print("Churn unique values:", telco_df['Churn'].unique())
Churn unique values: ['No' 'Yes']

Now, we convert the target variable Churn from 'Yes'/'No' to 1/0, for use in modelling later.

In [6]:
# Encode 'Churn' as binary 0/1 for modelling
telco_df['Churn'] = telco_df['Churn'].map({'No': 0, 'Yes': 1})

2.4 Remove Duplicate Rows¶

Finally, we check for and remove any duplicate rows.

In [7]:
# Drop duplicate rows if any
print(f"Duplicate rows: {telco_df.duplicated().sum()}")
telco_df = telco_df.drop_duplicates()
Duplicate rows: 22

2.5 Final Data Overview¶

Now we can display the cleaned dataset structure and verify that all of our transformations were applied correctly.

In [8]:
print("\nData Cleaned! Info:")
telco_df.info()
Data Cleaned! Info:
<class 'pandas.core.frame.DataFrame'>
Index: 7021 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7021 non-null   object 
 1   SeniorCitizen     7021 non-null   object 
 2   Partner           7021 non-null   object 
 3   Dependents        7021 non-null   object 
 4   tenure            7021 non-null   int64  
 5   PhoneService      7021 non-null   object 
 6   MultipleLines     7021 non-null   object 
 7   InternetService   7021 non-null   object 
 8   OnlineSecurity    7021 non-null   object 
 9   OnlineBackup      7021 non-null   object 
 10  DeviceProtection  7021 non-null   object 
 11  TechSupport       7021 non-null   object 
 12  StreamingTV       7021 non-null   object 
 13  StreamingMovies   7021 non-null   object 
 14  Contract          7021 non-null   object 
 15  PaperlessBilling  7021 non-null   object 
 16  PaymentMethod     7021 non-null   object 
 17  MonthlyCharges    7021 non-null   float64
 18  TotalCharges      7021 non-null   float64
 19  Churn             7021 non-null   int64  
dtypes: float64(2), int64(2), object(16)
memory usage: 1.1+ MB
In [9]:
print(f"Cleaned dataset: {telco_df.shape[0]} rows, {telco_df.shape[1]} columns")
telco_df.head()
Cleaned dataset: 7021 rows, 20 columns
Out[9]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 Female No Yes No 1 No No DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 0
1 Male No No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.50 0
2 Male No No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 1
3 Male No No No 45 No No DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 0
4 Female No No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 1

With the 22 duplicate rows removed, the cleaned dataset retains 7,021 of the original 7,043 rows. We've successfully converted TotalCharges to a numeric type with missing values filled, standardised SeniorCitizen to match the Yes/No format of other categorical features, and simplified service-related columns by consolidating "No service" variants into "No". The Churn target variable is now encoded as binary (0/1), and the non-predictive customerID column has been removed. The data is ready for analysis and modelling.

We can now export this cleaned dataframe as a fresh CSV file.

In [10]:
# Export for the Server
telco_df.to_csv("../data/clean_data.csv", index=False)
print("clean_data.csv has been saved successfully.")
clean_data.csv has been saved successfully.

3 Prototyping tools¶

Before we build our MCP server, we need to design the tools that Claude (or any AI assistant) will be able to call. Think of these as special functions that the AI can use to interact with our data.

Instead of writing these directly in the server file and hoping they work, we're prototyping them here first. This lets us test and refine the logic in a familiar notebook environment before moving everything to the server.

We'll be creating four main tools:

  • plot_distribution: Generates charts showing how values are spread across different columns (e.g., payment methods, monthly charges)
  • get_correlation_matrix: Creates a heatmap showing which numerical features are related to each other
  • run_pandas_query: Allows filtering the data using simple text queries (e.g., "show me senior citizens paying over £70")
  • predict_churn: Estimates a customer's likelihood of churning using a trained logistic regression model

Each function below represents the core logic we'll later wrap in our MCP server.

3.1 Plot Distribution Tool¶

This first tool creates visualisations for any column in the dataset. It automatically detects whether the column contains categories (like "Yes"/"No") or numbers, then picks the appropriate chart type.

In [11]:
def prototype_plot_distribution(column_name):
    """
    Generates a distribution plot for a column: a count plot for
    categorical columns, a histogram (with KDE) for numeric ones.
    """
    plt.figure(figsize=(8, 5))
    
    # Check if column is categorical or numeric to decide plot type
    if telco_df[column_name].dtype == 'object':
        # Categorical: Use a Countplot
        sns.countplot(y=telco_df[column_name], order=telco_df[column_name].value_counts().index)
        plt.title(f"Distribution of {column_name}")
    else:
        # Numeric: Use a Histogram
        sns.histplot(telco_df[column_name], kde=True)
        plt.title(f"Distribution of {column_name}")
        
    plt.show() # In the server, this will be plt.savefig()

Let's test the distribution plotter with a categorical column (payment methods):

In [12]:
# Testing on "PaymentMethod", should show a bar chart
prototype_plot_distribution('PaymentMethod') 
[Figure: count plot showing the distribution of PaymentMethod]

Now we test with a numerical column (monthly charges):

In [13]:
# Testing on "MonthlyCharges", should show a histogram
prototype_plot_distribution('MonthlyCharges')
[Figure: histogram of MonthlyCharges with KDE overlay]

3.2 Correlation Matrix Tool¶

Next, we create a correlation matrix tool. This shows which numerical features move together—useful for understanding relationships like "do higher monthly charges correlate with more churn?"

In [14]:
def get_correlation_matrix():
    """
    Calculates Pearson correlation between numerical columns and returns a heatmap.
    """
    # Select only numeric columns
    numeric_df = telco_df.select_dtypes(include=[np.number])
    
    # Calculate correlation matrix
    corr_matrix = numeric_df.corr()
    
    # Create heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                fmt='.2f', linewidths=0.5, square=True)
    plt.title('Correlation Matrix of Numerical Features')
    plt.tight_layout()
    
    plt.show()  # In the server, this will be plt.savefig()
    
    return corr_matrix

Now we'll test the correlation matrix tool to see relationships between the numerical features:

In [15]:
# Generate and display the correlation matrix heatmap
corr_matrix = get_correlation_matrix()
print("\nCorrelation Matrix Values:")
display(corr_matrix)
[Figure: correlation heatmap of tenure, MonthlyCharges, TotalCharges, and Churn]
Correlation Matrix Values:
tenure MonthlyCharges TotalCharges Churn
tenure 1.000000 0.245251 0.825595 -0.351508
MonthlyCharges 0.245251 1.000000 0.650653 0.194508
TotalCharges 0.825595 0.650653 1.000000 -0.197198
Churn -0.351508 0.194508 -0.197198 1.000000
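
The values confirm the expected relationships: tenure and TotalCharges are strongly correlated (0.83), since long-standing customers accumulate more charges, while Churn correlates negatively with tenure (-0.35), suggesting longer-tenured customers are less likely to leave.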

3.3 Pandas Query Tool¶

This query tool lets us filter the data using plain text conditions. It's like asking questions of our data: "Show me customers with X and Y characteristics".

In [16]:
def run_pandas_query(query_string: str):
    """
    Allows dynamic filtering of the telco dataframe using pandas query syntax.
    Example: "SeniorCitizen == 'Yes' and MonthlyCharges > 70"
    """
    try:
        # Execute the query
        filtered_df = telco_df.query(query_string)
        
        # Build summary
        summary = {
            "total_rows": len(filtered_df),
            "original_rows": len(telco_df),
            "percentage": f"{(len(filtered_df) / len(telco_df)) * 100:.2f}%"
        }
        
        print(f"Query: {query_string}")
        print(f"Results: {summary['total_rows']} rows ({summary['percentage']} of data)")
        print("-" * 50)
        
        return filtered_df
    
    except Exception as e:
        print(f"Error executing query: {e}")
        print("Tip: Use column names like 'SeniorCitizen', 'MonthlyCharges', 'Churn', etc.")
        return None

Let's test the query tool by finding senior citizens who pay more than £70 per month:

In [17]:
# Show all Senior Citizens who pay more than £70
result = run_pandas_query("SeniorCitizen == 'Yes' and MonthlyCharges > 70")
if result is not None:
    display(result.head(10))
Query: SeniorCitizen == 'Yes' and MonthlyCharges > 70
Results: 862 rows (12.28% of data)
--------------------------------------------------
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
30 Female Yes Yes No 71 Yes Yes Fiber optic Yes Yes Yes Yes No No Two year Yes Credit card (automatic) 96.35 6766.95 0
31 Male Yes Yes No 2 Yes No Fiber optic No No Yes No Yes Yes Month-to-month Yes Credit card (automatic) 95.50 181.65 0
50 Female Yes No No 43 Yes Yes Fiber optic No Yes No No Yes No Month-to-month Yes Electronic check 90.25 3838.75 0
53 Female Yes Yes No 8 Yes Yes Fiber optic No Yes No No No No Month-to-month Yes Credit card (automatic) 80.65 633.30 1
54 Female Yes Yes Yes 60 Yes No DSL Yes Yes Yes Yes No Yes One year Yes Credit card (automatic) 74.85 4456.35 0
55 Male Yes No No 18 Yes Yes Fiber optic No No No No Yes Yes Month-to-month Yes Electronic check 95.45 1752.55 1
57 Male Yes Yes Yes 66 Yes Yes Fiber optic No Yes Yes Yes Yes Yes One year Yes Electronic check 108.45 7076.35 0
72 Male Yes Yes Yes 64 Yes Yes Fiber optic Yes No Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 111.60 7099.00 0
75 Female Yes No No 56 Yes Yes Fiber optic Yes Yes Yes No Yes Yes One year No Electronic check 110.50 6139.50 0
78 Male Yes No No 30 Yes No DSL Yes Yes No No Yes Yes Month-to-month Yes Electronic check 74.75 2111.30 0

3.4 Regression Tool¶

Now we'll prototype a churn prediction tool. This uses machine learning to predict whether a customer is likely to leave based on their characteristics.

In [18]:
# Global variables to store the trained model and preprocessors
churn_model = None
scaler = None
label_encoders = {}
feature_columns = None

The function below trains our prediction model. It loads the cleaned data, encodes categories into numbers, and learns patterns that indicate churn risk.

In [19]:
def train_churn_model():
    """
    Trains a logistic regression model on the cleaned telco data.
    Call this on server startup to prepare the model for predictions.
    """
    global churn_model, scaler, label_encoders, feature_columns
    
    # Load the cleaned data
    df = pd.read_csv("../data/clean_data.csv")
    
    # Define features (exclude target)
    feature_columns = [col for col in df.columns if col != 'Churn']
    X = df[feature_columns].copy()
    y = df['Churn']
    
    # Encode categorical columns
    categorical_cols = X.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le
    
    # Scale numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )
    
    # Train logistic regression
    churn_model = LogisticRegression(max_iter=1000, random_state=42)
    churn_model.fit(X_train, y_train)
    
    # Evaluate
    train_acc = churn_model.score(X_train, y_train)
    test_acc = churn_model.score(X_test, y_test)
    
    print(f"✅ Churn model trained successfully!")
    print(f"   Training accuracy: {train_acc:.2%}")
    print(f"   Test accuracy: {test_acc:.2%}")
    
    return churn_model

Once trained, this prediction function takes a customer's details and estimates their likelihood of churning. It also provides a risk level and actionable recommendation.

In [20]:
def predict_churn(
    gender: str,
    senior_citizen: str,
    partner: str,
    dependents: str,
    tenure: int,
    phone_service: str,
    multiple_lines: str,
    internet_service: str,
    online_security: str,
    online_backup: str,
    device_protection: str,
    tech_support: str,
    streaming_tv: str,
    streaming_movies: str,
    contract: str,
    paperless_billing: str,
    payment_method: str,
    monthly_charges: float,
    total_charges: float
) -> dict:
    """
    Predicts the probability of a customer churning based on their attributes.
    
    Returns a dict with:
    - churn_probability: float (0-100%)
    - risk_level: str ('Low', 'Medium', 'High')
    - recommendation: str (business advice)
    """
    global churn_model, scaler, label_encoders, feature_columns
    
    if churn_model is None:
        return {"error": "Model not trained. Call train_churn_model() first."}
    
    # Build input dataframe
    input_data = pd.DataFrame([{
        'gender': gender,
        'SeniorCitizen': senior_citizen,
        'Partner': partner,
        'Dependents': dependents,
        'tenure': tenure,
        'PhoneService': phone_service,
        'MultipleLines': multiple_lines,
        'InternetService': internet_service,
        'OnlineSecurity': online_security,
        'OnlineBackup': online_backup,
        'DeviceProtection': device_protection,
        'TechSupport': tech_support,
        'StreamingTV': streaming_tv,
        'StreamingMovies': streaming_movies,
        'Contract': contract,
        'PaperlessBilling': paperless_billing,
        'PaymentMethod': payment_method,
        'MonthlyCharges': monthly_charges,
        'TotalCharges': total_charges
    }])
    
    # Encode categorical columns using saved encoders
    for col, le in label_encoders.items():
        if col in input_data.columns:
            try:
                input_data[col] = le.transform(input_data[col])
            except ValueError:
                return {"error": f"Invalid value for {col}: {input_data[col].values[0]}"}
    
    # Ensure column order matches training
    input_data = input_data[feature_columns]
    
    # Scale features
    input_scaled = scaler.transform(input_data)
    
    # Predict probability
    proba = churn_model.predict_proba(input_scaled)[0][1]
    churn_percentage = proba * 100
    
    # Determine risk level and recommendation
    if churn_percentage < 30:
        risk_level = "Low"
        recommendation = "Customer appears stable. Continue standard engagement."
    elif churn_percentage < 60:
        risk_level = "Medium"
        recommendation = "Monitor this customer. Consider proactive outreach or loyalty offers."
    else:
        risk_level = "High"
        recommendation = "High churn risk! Recommend immediate intervention: discount offers, contract upgrade incentives, or personalized retention call."
    
    return {
        "churn_probability": f"{churn_percentage:.1f}%",
        "risk_level": risk_level,
        "recommendation": recommendation
    }

Training the model on our cleaned data:

In [21]:
# Train the model
train_churn_model()
✅ Churn model trained successfully!
   Training accuracy: 80.36%
   Test accuracy: 79.57%
Out[21]:
LogisticRegression(max_iter=1000, random_state=42)

The model has been trained successfully. The training accuracy shows how well the model fits the training data, whilst the test accuracy indicates how well it generalises to unseen customers.

A test accuracy of roughly 80% is acceptable for this use case: about 4 out of 5 customers are classified correctly. Note that overall accuracy is not the same as the share of actual churners the model catches, so a per-class check, sketched below, is worth running.

The small gap between training and test accuracy suggests the model isn't overfitting, which means it should perform reliably on new customer data.
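
As a quick follow-up (a sketch, not part of the original pipeline), we can rebuild the same held-out split using the fitted preprocessing objects and inspect per-class precision and recall, which accuracy alone hides:

from sklearn.metrics import classification_report

# Rebuild the held-out split: the encoders and scaler were fitted on the
# full dataset before splitting, so reusing them together with the same
# random_state reproduces the exact test set from train_churn_model().
df = pd.read_csv("../data/clean_data.csv")
X = df[feature_columns].copy()
for col, le in label_encoders.items():
    X[col] = le.transform(X[col])
X_scaled = scaler.transform(X)
_, X_test, _, y_test = train_test_split(
    X_scaled, df['Churn'], test_size=0.2, random_state=42
)
print(classification_report(y_test, churn_model.predict(X_test),
                            target_names=['No churn', 'Churn']))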

We will now test the prediction with a high-risk customer profile (new customer, month-to-month contract, no add-ons):

In [22]:
# Predict churn for a sample customer (High Risk Profile)
result = predict_churn(
    gender='Female',
    senior_citizen='No',
    partner='No',
    dependents='No',
    tenure=1,
    phone_service='Yes',
    multiple_lines='No',
    internet_service='Fiber optic',
    online_security='No',
    online_backup='No',
    device_protection='No',
    tech_support='No',
    streaming_tv='No',
    streaming_movies='No',
    contract='Month-to-month',
    paperless_billing='Yes',
    payment_method='Electronic check',
    monthly_charges=70.0,
    total_charges=70.0
)

print("\nPrediction Result:")
for key, value in result.items():
    print(f"{key}: {value}")
Prediction Result:
churn_probability: 65.6%
risk_level: High
recommendation: High churn risk! Recommend immediate intervention: discount offers, contract upgrade incentives, or personalized retention call.

Now testing with a low-risk customer profile (long tenure, 2-year contract, stable payment method):

In [23]:
# Predict churn for a sample customer (Low Risk Profile)
result_low = predict_churn(
    gender='Male',
    senior_citizen='No',
    partner='Yes',
    dependents='Yes',
    tenure=72,
    phone_service='Yes',
    multiple_lines='Yes',
    internet_service='No',
    online_security='No',
    online_backup='No',
    device_protection='No',
    tech_support='No',
    streaming_tv='No',
    streaming_movies='No',
    contract='Two year',
    paperless_billing='No',
    payment_method='Mailed check',
    monthly_charges=25.0,
    total_charges=1800.0
)

print("\nPrediction Result (Low Risk):")
for key, value in result_low.items():
    print(f"{key}: {value}")
Prediction Result (Low Risk):
churn_probability: 0.2%
risk_level: Low
recommendation: Customer appears stable. Continue standard engagement.

The two test cases demonstrate how the model identifies churn risk based on customer characteristics:

High-Risk Profile (65.6% churn probability): The first customer shows a high churn probability due to several red flags: very short tenure (1 month), month-to-month contract (no commitment), fibre optic internet (higher cost), no value-added services (security, backup, support), and electronic cheque payment (higher churn correlation). These factors combine to indicate an unstable customer relationship. The model correctly flags this as requiring immediate intervention.

Low-Risk Profile (0.2% churn probability): The second customer shows an extremely low churn probability because of stabilising factors: long tenure (72 months), two-year contract (strong commitment), family ties (partner and dependents), basic phone service only (lower cost), and mailed cheque payment (traditional, established customer). This profile suggests a deeply embedded customer unlikely to leave.

The large difference between 65.6% and 0.2% demonstrates that the model has learnt meaningful relationships from the training data.

4 Next Steps¶

Now that we've prototyped and validated all our tools, we need to deploy them as an MCP server that our LLM (in this case, Claude Desktop) can communicate with. This involves three main steps:

  1. Create the server.py file: Wrap our prototype functions in MCP tool definitions
  2. Configure the batch script: Write a start_server.bat file to launch the server with the correct environment
  3. Connect Claude Desktop: Update Claude's configuration to recognise our new server

The MCP server acts as a bridge between Claude and our data analysis tools. When we ask Claude a question like "Show me the distribution of payment methods", Claude will automatically call the appropriate tool through this server.

4.1 Setting Up server.py¶

We create a server.py file that imports our prototype functions and exposes them as MCP tools. Each tool will have a name, description, and parameter schema to ensure Claude can understand how to use them.

This server will run continuously, listening for requests from Claude Desktop.
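
A minimal sketch of what server.py might look like, assuming the official MCP Python SDK's FastMCP helper (see references) and showing just one of the four tools; the real file would wrap all of the prototypes, saving figures with plt.savefig and returning file paths rather than calling plt.show:

# server.py -- minimal sketch using the MCP Python SDK (FastMCP)
from mcp.server.fastmcp import FastMCP
import pandas as pd

mcp = FastMCP("telco-analyst")

# Load the cleaned data once at startup (path is illustrative)
telco_df = pd.read_csv("data/clean_data.csv")

@mcp.tool()
def run_pandas_query(query_string: str) -> str:
    """Filter the telco dataset using pandas query syntax,
    e.g. "SeniorCitizen == 'Yes' and MonthlyCharges > 70"."""
    filtered = telco_df.query(query_string)
    return (f"{len(filtered)} of {len(telco_df)} rows match.\n"
            f"{filtered.head(10).to_string()}")

if __name__ == "__main__":
    mcp.run()  # stdio transport by default, which Claude Desktop uses

FastMCP derives each tool's name, description, and parameter schema from the function signature, type hints, and docstring, which is what enables Claude's tool discovery and parameter validation.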

4.2 Using the Batch Script¶

To run the MCP server, we create a batch script named start_server.bat. This script ensures that the server runs within the correct Conda environment that has all necessary dependencies installed.

This script, sketched below:

  1. Activates the mcp-analyst_py311 Conda environment (which has pandas, scikit-learn, matplotlib, etc.)
  2. Runs the server using the environment's Python interpreter
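
A hypothetical start_server.bat along these lines would do the job (both paths are placeholders to adjust for your own Conda install and project folder):

@echo off
REM Activate the Conda environment, then launch the MCP server.
REM Both paths below are placeholders for your machine.
CALL "%USERPROFILE%\miniconda3\Scripts\activate.bat" mcp-analyst_py311
python "C:\path\to\project\server.py"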

Important: Claude Desktop will launch it automatically when configured; there is no need to run it manually.

4.3 Configuring Claude Desktop¶

To connect Claude to our MCP server, we edit the Claude Desktop configuration file.

Location: %APPDATA%\Claude\claude_desktop_config.json

Inside the mcpServers section, we add this entry:

{
  "mcpServers": {
    "telco-analyst": {
      "command": "cmd.exe",
      "args": [
        "/c",
        "FULL PATH TO THE BATCH SCRIPT FILE"
      ]
    }
  }
}

What this does:

  • Registers a new MCP server called "telco-analyst"
  • Tells Claude to launch it by running our batch script
  • The server starts automatically when we open Claude Desktop

5 Testing the MCP Server with Claude Desktop¶

Now that we've set up the MCP server and connected it to Claude Desktop, it's time to test everything to ensure it's working correctly. As shown in the screenshot below, Claude has successfully detected our MCP server and is ready to use the tools we've implemented.

[Screenshot: Claude Desktop showing the telco-analyst MCP server connected, with its tools available]

We can ask Claude a few questions that should trigger our MCP tools.

5.1 Distribution Plot Test¶

  • "Show me the distribution of MonthlyCharges"

[Screenshot: Claude calling the plot_distribution tool and returning a histogram of MonthlyCharges]

5.2 Correlation Matrix Test¶

  • "What's the correlation between tenure and churn?"

[Screenshot: Claude calling the get_correlation_matrix tool and discussing the tenure-churn relationship]

5.3 Predict Churn Risk¶

  • "Predict churn risk for a customer: Female, not a senior citizen, no partner, no dependents, tenure of 3 months, has phone service with no multiple lines, fibre optic internet, no online security, no online backup, no device protection, no tech support, has streaming TV and movies, month-to-month contract, paperless billing, pays by electronic cheque, monthly charges £85, total charges £255"

[Screenshots: Claude parsing the customer attributes, calling the predict_churn tool, and returning the risk assessment]

5.4 Summary¶

The screenshots above demonstrate that our MCP server integrates successfully with Claude Desktop. When we asked Claude to "show me the distribution of MonthlyCharges", it automatically called the plot_distribution tool and returned a histogram visualisation. For the correlation query, Claude invoked the get_correlation_matrix tool and provided insights about the relationship between tenure and churn.

Most importantly, the churn prediction test confirmed that our predict_churn tool works end-to-end. Claude correctly parsed the customer attributes from natural language, passed them to our trained logistic regression model, and returned a risk assessment with an actionable recommendation.

6 Conclusion¶

This notebook demonstrates a complete proof of concept for building an MCP (Model Context Protocol) server that bridges AI assistants with real-world data and analytical capabilities. Starting from a raw telecommunications dataset, we cleaned and prepared the data, then prototyped four distinct tools: distribution plotting, correlation analysis, flexible data querying, and churn prediction using logistic regression. Each tool was tested in the notebook environment before being deployed as part of an MCP server that Claude Desktop can access directly.

MCP democratises data access, allowing non-technical users to query complex datasets and generate visualisations through natural language without writing code. It provides a standardised integration protocol, reducing the custom development needed to connect AI assistants to different data sources. For enterprises, this means exposing internal databases, machine learning models, and business logic to AI assistants in a controlled manner.

To extend this project, we could add more sophisticated tools such as automated report generation, real-time data connections, or ensemble models for improved prediction accuracy. The approach demonstrated here is transferable to virtually any domain: customer analytics, financial reporting, inventory management, or scientific research.

7 References¶

  • Model Context Protocol (MCP): https://modelcontextprotocol.io/
  • MCP Python SDK: https://github.com/modelcontextprotocol/python-sdk
  • Telco Customer Churn Dataset: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
  • Claude Desktop: https://claude.ai/download