Who's fibbing? A Machine Learning Approach to Acoustic Feature Analysis¶
1 Author¶
Name: Rashad Malik
2 Problem formulation¶
The goal of this project is to develop a machine learning model capable of distinguishing between true and deceptive narratives based on audio recordings of spoken stories.
This task is both intriguing and challenging: it requires exploring subtle acoustic features, such as pauses and pitch variability, to infer the truthfulness of speech, something people often struggle to do themselves.
The ability to automate this process has significant potential applications across multiple areas:
- Law enforcement and the legal system, which would benefit greatly from being able to reliably detect true or false statements.
- Psychological research and academia, where accurately assessing the veracity of statements can be critical.
However, there are also ethical concerns should a successful tool like this fall into the hands of bad actors. There is a risk of privacy invasion if people have not consented to having their statements assessed, and misuse of such a tool could have negative psychological impacts. The social implications of this project must therefore be continually assessed and kept in mind.
Beyond the practical implications, this problem intersects fields like computational linguistics, acoustic analysis, and behavioural science, offering an opportunity to explore how deception manifests in human speech patterns and whether these cues can be quantified effectively for predictive purposes.
Additionally, this project was used as an opportunity to learn additional computer science tools:
- Learning to use an IDE (VSCode) to write Jupyter notebooks.
- Using Git and GitHub to track project progress and to host it online.
3 Methodology¶
The methodology for this project involves several stages to develop and evaluate a machine learning model capable of classifying spoken narratives as true or deceptive. The approach includes training, validation, and testing phases.
3.1 Training Task¶
The training task involved preparing predictive models based on features extracted from 30-second audio chunks, with each chunk labelled as either part of a true or deceptive story, corresponding to the original audio file's label. The dataset was first divided into an 80:20 split for training and testing. Within the training set, Group-K-Fold cross-validation was performed to ensure all chunks from the same audio file were grouped together in a fold, preventing data leakage. This cross-validation process was used to determine the best-performing model based on accuracy, while avoiding overfitting to specific speech patterns. Predictors include features such as the number of pauses, total silence duration, pause-to-speech ratio, pitch range, and pitch variability, extracted from the audio samples.
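To make the grouping concrete, the snippet below is a minimal sketch of how GroupKFold keeps every chunk from the same recording on the same side of each split. The arrays here are made-up stand-ins, not the project's actual feature matrix or labels.

```python
import numpy as np
from sklearn.model_selection import GroupKFold

# Hypothetical stand-ins: 20 chunks with 5 features each, a binary label per
# chunk, and the ID of the recording each chunk was cut from.
rng = np.random.default_rng(0)
X = rng.normal(size=(20, 5))
y = rng.integers(0, 2, size=20)
file_ids = np.repeat(np.arange(5), 4)   # 5 recordings, 4 chunks each

gkf = GroupKFold(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=file_ids)):
    # All chunks from a given recording land entirely in either the training
    # or the validation indices, preventing leakage between folds.
    print(f"Fold {fold}: validation recordings {np.unique(file_ids[val_idx])}")
```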
3.2 Test Task¶
Once the best-performing model was identified during cross-validation, it was retrained using the entire training set with the optimal hyperparameters. The final evaluation was conducted on the unseen test set, ensuring a fair assessment of the model's generalisation ability. The performance on the test set provided the final results for accuracy and other metrics.
3.3 Model Performance Definition¶
Model performance is evaluated using multiple metrics. Accuracy serves as the primary measure, providing a simple baseline for overall model correctness and for comparing models during training and validation. After the best-performing model was selected, the F1-score was used for a balanced evaluation of precision and recall, which is particularly important in the presence of class imbalances. Additionally, the confusion matrix was analysed to identify patterns of misclassification, such as false positives (deceptive stories incorrectly classified as true) and false negatives (true stories incorrectly classified as deceptive).
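As a small illustration of how these metrics are computed with scikit-learn (the labels below are made up for demonstration, not results from this project):

```python
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Made-up labels purely for illustration: 1 = true story, 0 = deceptive story
y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]

print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1-score:", f1_score(y_true, y_pred))
# Rows correspond to actual classes, columns to predicted classes
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
```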
3.4 Other Tasks¶
Several preparatory steps were undertaken to build the model effectively. Audio data was preprocessed to standardise sampling rates and normalise amplitudes, ensuring consistent input. Feature engineering was carried out to extract meaningful acoustic features from each audio chunk. Exploratory data analysis helped us better design the extraction of some features such as silent regions within the audio. Finally, hyperparameter tuning was conducted during the cross-validation phase to optimise model performance, resulting in better generalisation to unseen data. This multi-step methodology helped ensure that the models were trained effectively and evaluated rigorously.
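The preprocessing idea can be sketched as follows; the 22050 Hz target rate is an assumption used for illustration, not necessarily the exact rate chosen in this project.

```python
import librosa

def preprocess_audio(path, target_sr=22050):
    """Resample a recording to a common rate and peak-normalise its amplitude."""
    x, sr = librosa.load(path, sr=target_sr)  # librosa resamples on load
    x = librosa.util.normalize(x)             # scale peak amplitude to 1
    return x, sr
```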
4 Implemented ML prediction pipelines¶
The implemented machine learning prediction pipeline processes audio data to classify spoken stories as either true or deceptive. The pipeline begins by normalising the extracted acoustic features, such as pauses, silence durations, and pitch variability, to ensure consistency across all input data.
Three machine learning models were explored:
- Logistic Regression
- Support Vector Machines (SVM)
- k-Nearest Neighbors (k-NN)
Each model was tuned using GridSearchCV with GroupKFold cross-validation to prevent data leakage and identify the optimal hyperparameters. The best-performing model, an SVM with an RBF kernel, was selected based on cross-validation accuracy and subsequently evaluated on a separate test set to assess its generalisation performance. The pipeline ensures a systematic approach to feature preparation, model training, and evaluation.
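The tuning step can be sketched roughly as follows. The feature arrays are random stand-ins and the parameter grid is illustrative, not the exact grid searched in this project.

```python
import numpy as np
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Stand-in features, labels, and source-recording IDs for illustration only
rng = np.random.default_rng(1)
X_train = rng.normal(size=(40, 5))
y_train = rng.integers(0, 2, size=40)
file_ids = np.repeat(np.arange(10), 4)

pipe = make_pipeline(StandardScaler(), SVC(kernel="rbf"))
param_grid = {"svc__C": [0.1, 1, 10], "svc__gamma": ["scale", 0.01, 0.1]}

search = GridSearchCV(pipe, param_grid, cv=GroupKFold(n_splits=5), scoring="accuracy")
search.fit(X_train, y_train, groups=file_ids)  # groups are routed to GroupKFold
print("Best parameters:", search.best_params_)
print("Best cross-validation accuracy:", round(search.best_score_, 3))
```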
4.1 Transformation Stage¶
The transformation stage of the pipeline focuses on two main areas: the preprocessing stage, and the feature extraction stage to prepare the raw audio data for the machine learning models. The input to this stage consists of 30-second chunks of audio data, sampled at a uniform rate. The output is a set of numerical features that capture key acoustic characteristics of the audio, including the number of pauses, total silence duration, pause-to-speech ratio, pitch range, and pitch variability.
According to Rockwell, Buller, and Burgoon (1997), deceivers may exhibit increased pitch variety, potentially as a strategy to appear more truthful and expressive. Loy, Rohde, and Corley (2018) found an increase in filled pauses when lying, especially for complex lies that require greater cognitive effort to fabricate. These findings were the main factors behind our feature selection. Studies also point towards speech rate (tempo) as another strong indicator of deception; however, this is a more complicated feature to extract and, due to time constraints, it was not included in this project.
Language is another key feature that could help our models with their classification, as different languages have distinct speech patterns. However, our project's dataset is heavily skewed towards English speakers (78 of the 100 audio files are in English), so we will not use language as an extracted feature. In future work, this would be a useful feature to explore, to understand how it impacts our models.
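As a sketch of how such features could be extracted for a single chunk with librosa (the silence threshold and pitch search range below are illustrative assumptions, not the exact settings used in this project):

```python
import numpy as np
import librosa

def extract_chunk_features(x, fs, top_db=30):
    """Sketch of pause- and pitch-based features for one 30-second chunk.
    The silence threshold (top_db) and the pitch range below are assumptions."""
    # Non-silent intervals; gaps between them are treated as pauses
    intervals = librosa.effects.split(x, top_db=top_db)
    speech_duration = sum((end - start) for start, end in intervals) / fs
    total_duration = len(x) / fs
    silence_duration = total_duration - speech_duration
    num_pauses = max(len(intervals) - 1, 0)
    pause_to_speech = silence_duration / speech_duration if speech_duration > 0 else 0.0

    # Fundamental frequency track for pitch range and variability
    f0, voiced_flag, _ = librosa.pyin(x, fmin=librosa.note_to_hz("C2"),
                                      fmax=librosa.note_to_hz("C7"), sr=fs)
    voiced_f0 = f0[~np.isnan(f0)]
    pitch_range = float(voiced_f0.max() - voiced_f0.min()) if voiced_f0.size else 0.0
    pitch_std = float(voiced_f0.std()) if voiced_f0.size else 0.0

    return [num_pauses, silence_duration, pause_to_speech, pitch_range, pitch_std]
```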
After feature extraction, the data is normalised using StandardScaler to standardise the feature distributions, ensuring that all predictors are on a consistent scale. This step is particularly crucial for machine learning models like SVM and k-NN, which are sensitive to feature magnitudes. The transformation stage ensures that the data is both interpretable and compatible with the machine learning models, enabling effective training and evaluation. By focusing on these specific features, the pipeline emphasises interpretability while leveraging acoustic properties relevant to the challenge of distinguishing true from deceptive stories.
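The effect of scaling can be illustrated with a few made-up feature rows (the values below are not from the project's data):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Made-up rows: (num_pauses, silence_s, pause_ratio, pitch_range_hz, pitch_std_hz)
X = np.array([[3, 12.0, 0.40, 180.0, 25.0],
              [7, 20.5, 0.90, 220.0, 40.0],
              [2,  8.0, 0.30, 150.0, 18.0],
              [6, 18.0, 0.80, 240.0, 45.0]])

X_scaled = StandardScaler().fit_transform(X)
# Each column now has zero mean and unit variance, so pitch range (hundreds of Hz)
# no longer dominates pause counts (single digits) in distance-based models.
print(X_scaled.mean(axis=0).round(2))
print(X_scaled.std(axis=0).round(2))
```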
4.2 Model stage¶
The machine learning models implemented for this project are Support Vector Machines (SVM), Logistic Regression, and k-Nearest Neighbors (k-NN). These models were selected for their complementary strengths, their ability to handle the relatively small dataset and numerical features extracted from the audio data, and their ease of use due to their availability in the scikit-learn library.
4.2.1 Logistic Regression¶
Logistic Regression was included as a baseline model due to its simplicity, interpretability, and availability in scikit-learn. It assumes a linear relationship between the input features and the target variable, which provides a useful benchmark for evaluating the complexity of the problem. Its straightforward implementation allowed for rapid prototyping and ensured that initial results could be obtained quickly, saving time for more complex analyses.
4.2.2 Support Vector Machines (SVM)¶
SVM was chosen due to its effectiveness in handling non-linear relationships through the use of kernels, such as the radial basis function (RBF) kernel. Given the complexity of audio-derived features and the possibility of non-linear decision boundaries between true and deceptive stories, SVM provides a robust framework for classification. It also performs well in high-dimensional spaces, which is beneficial for handling the multiple acoustic features extracted in this project.
4.2.3 k-Nearest Neighbors (k-NN)¶
k-NN was selected for its simplicity and ability to model local relationships in the data. As a non-parametric method, k-NN relies on the proximity of data points in feature space, which may be advantageous when working with small datasets and features like pauses and pitch variability that capture local variations. Its implementation in scikit-learn made it easy to test and tune hyperparameters such as the number of neighbors and distance metric, enabling efficient experimentation without unnecessary complexity.
4.2.4 Ease of Implementation¶
One of the primary reasons these models were chosen was their accessibility through the scikit-learn library, which offers straightforward implementations of these algorithms with consistent syntax and extensive documentation. This reduced the time required for setting up and running experiments, allowing for a focus on feature engineering, evaluation, and hyperparameter tuning. Given the project's tight timeline, this ease of use was a critical factor in selecting models that could be implemented and evaluated efficiently, without introducing undue complexity.
These models provide a balance of robustness, simplicity, and practicality, ensuring that meaningful results could be obtained within the constraints of the project timeline. Through cross-validation and hyperparameter tuning, the best-performing model (SVM) was identified, reflecting its suitability for capturing the underlying structure of the data.
4.3 Ensemble stage¶
For this project, an ensemble approach was not implemented due to time constraints and the focus on building and evaluating individual models (SVM, Logistic Regression, and k-NN). Each of these models was trained and optimised independently, and the best-performing model (SVM) was selected based on cross-validation results for evaluation on the test set.
Although ensembles were not included in the current implementation, they present a valuable future avenue for exploration. Ensemble methods, such as voting classifiers, bagging (e.g., Random Forest), or boosting (e.g., Gradient Boosting, AdaBoost), could potentially improve the overall performance by combining the strengths of individual models. For example:
- Soft Voting Ensembles could combine the probabilistic outputs of SVM, Logistic Regression, and k-NN to create a more robust classifier.
- Stacking Ensembles could use a meta-model, such as Logistic Regression, to learn how to best combine the predictions of the base models.
Ensemble methods are particularly useful in scenarios where individual models capture different aspects of the data and their errors are uncorrelated. By combining their predictions, ensembles can often achieve higher accuracy and robustness than any single model.
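To make this future direction concrete, a soft-voting ensemble of the three models could be sketched as follows (this was not implemented or evaluated in this project):

```python
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Soft voting averages the predicted class probabilities of the base models;
# SVC needs probability=True to expose predict_proba.
voting_clf = VotingClassifier(
    estimators=[
        ("logreg", make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))),
        ("svm", make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True))),
        ("knn", make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))),
    ],
    voting="soft",
)
# voting_clf would then be fitted and evaluated with the same GroupKFold
# scheme used for the individual models.
```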
5 Dataset¶
For this project, we are using data from the MLEnd Small Deception dataset. This dataset, created by the Data Science and AI Teaching Group at Queen Mary University of London, is a collection of 100 audio files. Each audio file is roughly two to four minutes long (the exact range is explored in Section 5.2) and contains a story narrated by a student or faculty member at the University. Each story is either true or deceptive (made up).
In addition to the 100 audio files, the dataset also includes a CSV file which contains the Language and Story Type (whether the story was truthful or deceptive) for each audio file.
5.1 MLEnd library setup and data download¶
# Showing that the MLEnd library is installed on our system
!pip show mlend
# If you do not have the library installed, uncomment the line below (ensure it is version 1.0.0.4)
# !pip install mlend==1.0.0.4
Name: mlend
Version: 1.0.0.4
Summary: MLEnd Datasets
Home-page: https://MLEndDatasets.github.io
Author: Jesús Requena Carrión and Nikesh Bajaj
Author-email: nikkeshbajaj@gmail.com
License: MIT
Location: c:\programdata\anaconda3\envs\py310\lib\site-packages
Requires: joblib, matplotlib, numpy, pandas, scipy, spkit
Required-by:
Now, we will import the library along with some key functions, and download the dataset.
Note: The downloaded dataset requires approximately 1.8GB of available disk space.
# Importing the MLEnd library and functions
import mlend
from mlend import download_deception_small, deception_small_load
# Downloading the "small" deception dataset
datadir = download_deception_small(save_to='MLEnd', subset={}, verbose=1, overwrite=False)
Downloading 100 stories (audio files) from https://github.com/MLEndDatasets/Deception
100%|▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓|100\100|00100.wav
Done!
5.2 Preliminary data exploration¶
We will now explore our downloaded files to understand the dataset a bit further before beginning our experimentation.
import glob
# Checking how many audio files we have
sample_path = r'MLEnd\deception\MLEndDD_stories_small\*.wav'
files = glob.glob(sample_path)
print("Number of downloaded audio files:",len(files))
Number of downloaded audio files: 100
import librosa
import matplotlib.pyplot as plt
import matplotlib.style as style
# Finding the ranges of durations for our audio files
durations = []
for f in files:
    # Load at the native sampling rate and record each file's duration
    x, fs = librosa.load(f, sr=None)
    durations.append(librosa.get_duration(y=x, sr=fs))
print("Shortest audio file duration (in seconds):", round(min(durations),1))
print("Longest audio file duration (in seconds):", round(max(durations),1))
# Plotting the distribution of durations
style.use("seaborn-v0_8")
plt.hist(durations, bins=10, edgecolor="black", alpha=0.7)
plt.title("Distribution of audio durations", fontsize = 12)
plt.xlabel("Duration (seconds)", fontsize = 10, fontweight = "bold")
plt.ylabel("Frequency", fontsize = 10, fontweight = "bold")
plt.show()
Shortest audio file duration (in seconds): 43.4
Longest audio file duration (in seconds): 247.6
We can see that most audio files are around 110 to 150 seconds long, and there is a wide distribution of durations, ranging from as low as 43 seconds to 248 seconds. We will need to keep this in mind when we eventually start experimenting with these files.
We will now import the CSV file into a Pandas dataframe, and use it to describe the distribution of languages and story types.
import pandas as pd
# Loading the CSV file into a Pandas dataframe
MLEND_df = pd.read_csv(r'MLEnd\deception\MLEndDD_story_attributes_small.csv')
language_counts = MLEND_df["Language"].value_counts()
story_type_counts = MLEND_df["Story_type"].value_counts()
display(MLEND_df)
display(language_counts)
print("Number of unique languages in the dataset:", len(MLEND_df["Language"].unique()))
display(story_type_counts)
| | filename | Language | Story_type |
|---|---|---|---|
| 0 | 00001.wav | Hindi | deceptive_story |
| 1 | 00002.wav | English | true_story |
| 2 | 00003.wav | English | deceptive_story |
| 3 | 00004.wav | Bengali | deceptive_story |
| 4 | 00005.wav | English | deceptive_story |
| ... | ... | ... | ... |
| 95 | 00096.wav | English | deceptive_story |
| 96 | 00097.wav | English | true_story |
| 97 | 00098.wav | English | deceptive_story |
| 98 | 00099.wav | English | true_story |
| 99 | 00100.wav | English | deceptive_story |

100 rows × 3 columns
Language
English              78
Hindi                 4
Arabic                3
Chinese, Mandarin     2
Marathi               2
Bengali               1
Kannada               1
French                1
Russian               1
Portuguese            1
Spanish               1
Swahilli              1
Telugu                1
Korean                1
Cantonese             1
Italian               1
Name: count, dtype: int64
Number of unique languages in the dataset: 16
Story_type
deceptive_story    50
true_story         50
Name: count, dtype: int64
There are 16 unique languages in the dataset, with English being the most commonly spoken (78 audio files). Additionally, we have an even split of story types, with true and deceptive stories each making up half of the dataset.
5.3 Data preparation and training/test data split¶
5.3.1 Creating 30 second chunks¶
In this project, we are specifically required to build a machine learning model that takes a 30-second audio clip and predicts whether the story being narrated is true or not.
However, as seen in section 5.2, the durations of our audio files vary widely. Therefore, we will first need to split all of our audio files into 30-second chunks to ensure data consistency and computational efficiency. This strikes a balance between retaining meaningful context and keeping the data manageable for both feature extraction and our machine learning models.
This raises a new challenge: as our audio files are not perfectly divisible by 30 seconds, we must decide on an approach for dividing every file into 30-second chunks. The approach used in this project is as follows:
- Round the audio length down to the nearest multiple of 30 seconds.
- Subtract this from the original length to get the excess duration.
- Divide the excess by two, and trim that amount from both the start and the end.
For example, the 247.6-second file rounds down to 240 seconds, leaving 7.6 seconds of excess, so 3.8 seconds are trimmed from each end, giving exactly eight 30-second chunks.
Let us demonstrate this approach practically:
import IPython.display as ipd
# Function to trim audio evenly from the start and tail
def trim_audio_equal(audio_file):
    # Load at the file's native sampling rate
    x, fs = librosa.load(audio_file, sr=None)
    total_duration = librosa.get_duration(y=x, sr=fs)
    # Calculate the excess audio duration beyond a multiple of 30 seconds
    excess_duration = total_duration % 30
    # Trimming half of the excess from the start and half from the end
    trim_amount = int((excess_duration / 2) * fs)
    total_trimmed = (trim_amount * 2) / fs
    # Slice by explicit end index so files already divisible by 30 are returned untrimmed
    trimmed_audio = x[trim_amount : len(x) - trim_amount]
    return trimmed_audio, total_trimmed, fs, x
file_number = 1
trimmed_audio, amount, fs, x = trim_audio_equal(files[file_number - 1])
print("Original audio from file number " + str(file_number) + ":")
display(ipd.Audio(x, rate=fs))
print("File number", file_number, "has successfully been trimmed.\n",
round(amount, 2), "seconds removed.")
display(ipd.Audio(trimmed_audio, rate=fs))
Original audio from file number 1: