# Installing the libraries with the specified version.
# uncomment and run the following line if Google Colab is being used
# !pip install scikit-learn==1.2.2 seaborn==0.13.1 matplotlib==3.7.1 numpy==1.25.2 pandas==1.5.3 yellowbrick==1.5 -q --user

# Installing the libraries with the specified version.
# uncomment and run the following lines if Jupyter Notebook is being used
# !pip install scikit-learn==1.2.2 seaborn==0.13.1 matplotlib==3.7.1 numpy==1.25.2 pandas==1.5.2 yellowbrick==1.5 -q --user
# !pip install --upgrade -q jinja2

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style='darkgrid')

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# to scale the data using z-score
from sklearn.preprocessing import StandardScaler

# to compute distances
from scipy.spatial.distance import cdist, pdist

# to perform k-means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# to visualize the elbow curve and silhouette scores
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

# to suppress warnings
import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

data = pd.read_csv('/content/drive/MyDrive/content/stock_data.csv')

print(f"There are {data.shape[0]} rows and {data.shape[1]} columns.")

There are 340 rows and 15 columns.

data.sample(n=10, random_state=1)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticker Symbol                 340 non-null    object 
 1   Security                      340 non-null    object 
 2   GICS Sector                   340 non-null    object 
 3   GICS Sub Industry             340 non-null    object 
 4   Current Price                 340 non-null    float64
 5   Price Change                  340 non-null    float64
 6   Volatility                    340 non-null    float64
 7   ROE                           340 non-null    int64  
 8   Cash Ratio                    340 non-null    int64  
 9   Net Cash Flow                 340 non-null    int64  
 10  Net Income                    340 non-null    int64  
 11  Earnings Per Share            340 non-null    float64
 12  Estimated Shares Outstanding  340 non-null    float64
 13  P/E Ratio                     340 non-null    float64
 14  P/B Ratio                     340 non-null    float64
dtypes: float64(7), int64(4), object(4)
memory usage: 40.0+ KB

data.duplicated().sum()

0

data.isna().sum()

data.describe(include='all').T

df = data.copy()

df = df.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Ticker Symbol                 340 non-null    category
 1   Security                      340 non-null    category
 2   GICS Sector                   340 non-null    category
 3   GICS Sub Industry             340 non-null    category
 4   Current Price                 340 non-null    float64 
 5   Price Change                  340 non-null    float64 
 6   Volatility                    340 non-null    float64 
 7   ROE                           340 non-null    int64   
 8   Cash Ratio                    340 non-null    int64   
 9   Net Cash Flow                 340 non-null    int64   
 10  Net Income                    340 non-null    int64   
 11  Earnings Per Share            340 non-null    float64 
 12  Estimated Shares Outstanding  340 non-null    float64 
 13  P/E Ratio                     340 non-null    float64 
 14  P/B Ratio                     340 non-null    float64 
dtypes: category(4), float64(7), int64(4)
memory usage: 58.1 KB

def histogram_boxplot(df, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to the show density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=df, x=feature, ax=ax_box2, showmeans=True, color="lightblue"
    )  # boxplot will be created and a star will indicate the mean value of the column
    sns.histplot(
        data=df, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
    ) if bins else sns.histplot(
        data=df, x=feature, kde=kde, ax=ax_hist2
    )  # For histogram
    ax_hist2.axvline(
        df[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        df[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram

def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # width of the plot
        y = p.get_height()  # height of the plot

        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage

    plt.show()  # show the plot

histogram_boxplot(df, 'Current Price')

histogram_boxplot(df, 'Price Change')

histogram_boxplot(df, 'Volatility')

histogram_boxplot(df, 'ROE')

histogram_boxplot(df, 'Cash Ratio')

histogram_boxplot(df, 'Net Cash Flow')

histogram_boxplot(df, 'Net Income')

histogram_boxplot(df, 'Earnings Per Share')

histogram_boxplot(df, 'Estimated Shares Outstanding')

histogram_boxplot(df, 'P/E Ratio')

histogram_boxplot(df, 'P/B Ratio')

labeled_barplot(df, 'GICS Sector', perc='true')

labeled_barplot(df, 'GICS Sub Industry')

# correlation check
# Select only numeric columns for correlation matrix
numeric_df = df.select_dtypes(include=[np.number])

# Calculate the correlation matrix on numeric columns only
corr = numeric_df.corr()

# Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='Blues', fmt='.2f')
plt.title('Correlation Matrix of Variables')
plt.show()

plt.figure(figsize=(15,8))
sns.barplot(data=df, x='Price Change', y='GICS Sector', ci=False, palette='Blues_r')
plt.show()

plt.figure(figsize=(15,8))
sns.barplot(data=df, x='Cash Ratio', y='GICS Sector', ci=False, palette='Blues_r')
plt.show()

plt.figure(figsize=(15,8))
sns.barplot(data=df, x='P/E Ratio', y='GICS Sector', ci=False, palette='Blues_r')
plt.show()

plt.figure(figsize=(15,8))
sns.barplot(data=df, x='Volatility', y='GICS Sector', ci=False, palette='Blues_r')
plt.show()

plt.figure(figsize=(15, 12))

numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

for i, variable in enumerate(numeric_columns):
    plt.subplot(3, 4, i + 1)
    plt.boxplot(df[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)

plt.show()

num_cols = df.select_dtypes(include = np.number).columns.tolist()

scaler = StandardScaler()
subset = df[num_cols].copy()
subset_scaled = scaler.fit_transform(subset)

subset_scaled_df = pd.DataFrame(subset_scaled, columns = subset.columns)

subset_scaled_df.head()

k_means_df = subset_scaled_df.copy()

clusters = range(1, 15)
meanDistortions = []

for k in clusters:
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(subset_scaled_df)
    prediction = model.predict(k_means_df)
    distortion = (
        sum(np.min(cdist(k_means_df, model.cluster_centers_, "euclidean"), axis=1))
        / k_means_df.shape[0]
    )

    meanDistortions.append(distortion)

    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)

plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average Distortion")
plt.title("Selecting k with the Elbow Method", fontsize=20)
plt.show()

Number of Clusters: 1 	Average Distortion: 2.5425069919221697
Number of Clusters: 2 	Average Distortion: 2.3862098789299604
Number of Clusters: 3 	Average Distortion: 2.33620927590848
Number of Clusters: 4 	Average Distortion: 2.219050563833442
Number of Clusters: 5 	Average Distortion: 2.133404401901685
Number of Clusters: 6 	Average Distortion: 2.081503686093715
Number of Clusters: 7 	Average Distortion: 2.0045413402786814
Number of Clusters: 8 	Average Distortion: 1.9864237824874411
Number of Clusters: 9 	Average Distortion: 1.956222103389025
Number of Clusters: 10 	Average Distortion: 1.9360473996664198
Number of Clusters: 11 	Average Distortion: 1.8615942883461607
Number of Clusters: 12 	Average Distortion: 1.8219574388532505
Number of Clusters: 13 	Average Distortion: 1.7936924742607907
Number of Clusters: 14 	Average Distortion: 1.7567842179093438

model = KMeans(random_state=1)
visualizer = KElbowVisualizer(model, k=(1, 15), timings=True)
visualizer.fit(k_means_df)  # fit the data to the visualizer
visualizer.show()  # finalize and render figure

<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

sil_score = []
cluster_list = range(2, 15)
for n_clusters in cluster_list:
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)
    preds = clusterer.fit_predict((subset_scaled_df))
    score = silhouette_score(k_means_df, preds)
    sil_score.append(score)
    print("For n_clusters = {}, the silhouette score is {})".format(n_clusters, score))

plt.plot(cluster_list, sil_score)
plt.show()

For n_clusters = 2, the silhouette score is 0.45335782729503565)
For n_clusters = 3, the silhouette score is 0.40374060030338865)
For n_clusters = 4, the silhouette score is 0.4246430808437099)
For n_clusters = 5, the silhouette score is 0.4381539778147092)
For n_clusters = 6, the silhouette score is 0.40869599703024256)
For n_clusters = 7, the silhouette score is 0.1207450219233897)
For n_clusters = 8, the silhouette score is 0.3693991650696542)
For n_clusters = 9, the silhouette score is 0.35185096182499204)
For n_clusters = 10, the silhouette score is 0.32950073703610283)
For n_clusters = 11, the silhouette score is 0.1486586842527321)
For n_clusters = 12, the silhouette score is 0.15784241071085106)
For n_clusters = 13, the silhouette score is 0.15646997458716602)
For n_clusters = 14, the silhouette score is 0.16253506827999134)

model = KMeans(random_state=1)
visualizer = KElbowVisualizer(model, k=(2, 15), metric="silhouette", timings=True)
visualizer.fit(k_means_df)  # fit the data to the visualizer
visualizer.show()  # finalize and render figure

<Axes: title={'center': 'Silhouette Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='silhouette score'>

visualizer = SilhouetteVisualizer(KMeans(n_clusters=5, random_state=1))
visualizer.fit(k_means_df)
visualizer.show()

<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 340 Samples in 5 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>

visualizer = SilhouetteVisualizer(KMeans(n_clusters=6, random_state=1))
visualizer.fit(k_means_df)
visualizer.show()

<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 340 Samples in 6 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>

visualizer = SilhouetteVisualizer(KMeans(n_clusters=8, random_state=1))
visualizer.fit(k_means_df)
visualizer.show()

<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 340 Samples in 8 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>

kmeans = KMeans(n_clusters=5, random_state=1)
kmeans.fit(k_means_df)

KMeans(n_clusters=5, random_state=1)

KMeans(n_clusters=5, random_state=1)

# Creating a copy of the original data
dfK = df.copy()

# Adding Kmeans cluster labels to the original and scaled dataframes
k_means_df['KM_segments'] = kmeans.labels_
dfK['KM_segments'] = kmeans.labels_

dfK['KM_segments'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 340 entries, 0 to 339
Series name: KM_segments
Non-Null Count  Dtype
--------------  -----
340 non-null    int32
dtypes: int32(1)
memory usage: 1.5 KB

dfK.head(10)

dfK.describe()

km_cluster_profile = dfK.groupby('KM_segments').mean(numeric_only = True)

km_cluster_profile['count_in_each_segment'] = (
    dfK.groupby("KM_segments")["Security"].count().values
)

km_cluster_profile.style.highlight_max(color = 'lightgreen', axis = 0)

for cl in dfK['KM_segments'].unique():
    print('There are total of', dfK[dfK["KM_segments"] == cl]['Security'].nunique(), 'companies in cluster {}, the following companies are present:'.format(cl))
    print(dfK[dfK["KM_segments"] == cl]['Security'].unique())
    print("=" * 100, "\n")
    print()

There are total of 276 companies in cluster 0, the following companies are present:
['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'Adobe Systems Inc', 'Archer-Daniels-Midland Co', ..., 'Xylem Inc.', 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis']
Length: 276
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


There are total of 26 companies in cluster 4, the following companies are present:
['Analog Devices, Inc.', 'Alliance Data Systems', 'Alexion Pharmaceuticals', 'Amgen Inc', 'Amazon.com Inc', ..., 'TripAdvisor', 'Vertex Pharmaceuticals Inc', 'Waters Corporation', 'Wynn Resorts Ltd', 'Yahoo Inc.']
Length: 26
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


There are total of 2 companies in cluster 2, the following companies are present:
['Apache Corporation', 'Chesapeake Energy']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


There are total of 27 companies in cluster 1, the following companies are present:
['Anadarko Petroleum Corp', 'Baker Hughes Inc', 'Cabot Oil & Gas', 'Concho Resources', 'Devon Energy Corp.', ..., 'Spectra Energy Corp.', 'Southwestern Energy', 'Teradata Corp.', 'Williams Cos.', 'Cimarex Energy']
Length: 27
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


There are total of 9 companies in cluster 3, the following companies are present:
['Citigroup Inc.', 'Ford Motor', 'JPMorgan Chase & Co.', 'Coca Cola Company', 'Pfizer Inc.', 'AT&T Inc', 'Verizon Communications', 'Wells Fargo', 'Exxon Mobil Corp.']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
====================================================================================================

dfK.groupby(["KM_segments", "GICS Sector"])["Security"].count()

plt.figure(figsize = (20,20))
plt.suptitle("Boxplot of numerical variables for each cluster")

# To select numerical columns
num_col = df.select_dtypes(include = np.number).columns.tolist()
for i, variable in enumerate(num_col):
    plt.subplot(3, 4, i + 1)
    sns.boxplot(data = dfK, x = "KM_segments", y = variable, palette = 'husl', flierprops = dict(markerfacecolor='lightcoral', marker = 'D'))

plt.tight_layout(pad = 2.0)

# Generate a color map with a unique color for each bar
num_of_bars = len(dfK.columns) - 1  # Subtract 1 for the GROUP column
cm = plt.get_cmap('tab10')
colors = [cm(1. * i / num_of_bars) for i in range(num_of_bars)]

dfK.groupby("KM_segments").mean(numeric_only = True).plot.bar(figsize = (30,15), color = colors)

<Axes: xlabel='KM_segments'>

hc_df = subset_scaled_df.copy()

# List of distance metrics
distance_metrics = ['euclidean', 'chebyshev', 'mahalanobis', 'cityblock']

# List of linkage methods
linkage_methods = ['single', 'complete', 'average', 'weighted']

high_cophenet_corr = 0
high_dm_lm = [0,0]

for dm in distance_metrics:
    for lm in linkage_methods:
        Z = linkage(hc_df, metric = dm, method = lm)
        c, coph_dists = cophenet(Z, pdist(hc_df))
        print(
            "Cophenetic correlation for {} distance and {} linkage is {}.".format(dm.capitalize(), lm, c)
        )
        if high_cophenet_corr < c:
            high_cophenet_corr = c
            high_dm_lm[0] = dm
            high_dm_lm[1] = lm


# Printing the combination of distance metric and linkage method with the highest cophenetic correlation
print('***'*50)

print(
    "Highest cophenetic correlation is {}, which is obtained with {} distance and {} linkage".format(high_cophenet_corr,
                                                                                                     high_dm_lm[0].capitalize(),
                                                                                                     high_dm_lm[1])
)

Cophenetic correlation for Euclidean distance and single linkage is 0.9232271494002922.
Cophenetic correlation for Euclidean distance and complete linkage is 0.7873280186580672.
Cophenetic correlation for Euclidean distance and average linkage is 0.9422540609560814.
Cophenetic correlation for Euclidean distance and weighted linkage is 0.8693784298129404.
Cophenetic correlation for Chebyshev distance and single linkage is 0.9062538164750717.
Cophenetic correlation for Chebyshev distance and complete linkage is 0.598891419111242.
Cophenetic correlation for Chebyshev distance and average linkage is 0.9338265528030499.
Cophenetic correlation for Chebyshev distance and weighted linkage is 0.9127355892367.
Cophenetic correlation for Mahalanobis distance and single linkage is 0.925919553052459.
Cophenetic correlation for Mahalanobis distance and complete linkage is 0.7925307202850002.
Cophenetic correlation for Mahalanobis distance and average linkage is 0.9247324030159736.
Cophenetic correlation for Mahalanobis distance and weighted linkage is 0.8708317490180428.
Cophenetic correlation for Cityblock distance and single linkage is 0.9334186366528574.
Cophenetic correlation for Cityblock distance and complete linkage is 0.7375328863205818.
Cophenetic correlation for Cityblock distance and average linkage is 0.9302145048594667.
Cophenetic correlation for Cityblock distance and weighted linkage is 0.731045513520281.
******************************************************************************************************************************************************
Highest cophenetic correlation is 0.9422540609560814, which is obtained with Euclidean distance and average linkage

# list of linkage methods
linkage_methods = ['single', 'complete', 'average', 'centroid', 'ward', 'weighted']

high_cophenet_corr = 0
high_dm_lm = [0, 0]

for lm in linkage_methods:
    Z = linkage(hc_df, metric="euclidean", method=lm)
    c, coph_dists = cophenet(Z, pdist(hc_df))
    print("Cophenetic correlation for {} linkage is {}.".format(lm, c))
    if high_cophenet_corr < c:
        high_cophenet_corr = c
        high_dm_lm[0] = "euclidean"
        high_dm_lm[1] = lm

# printing the combination of distance metric and linkage method with the highest cophenetic correlation
print('*'*100)
print(
    "Highest cophenetic correlation is {}, which is obtained with {} linkage.".format(
        high_cophenet_corr, high_dm_lm[1]
    )
)

Cophenetic correlation for single linkage is 0.9232271494002922.
Cophenetic correlation for complete linkage is 0.7873280186580672.
Cophenetic correlation for average linkage is 0.9422540609560814.
Cophenetic correlation for centroid linkage is 0.9314012446828154.
Cophenetic correlation for ward linkage is 0.7101180299865353.
Cophenetic correlation for weighted linkage is 0.8693784298129404.
****************************************************************************************************
Highest cophenetic correlation is 0.9422540609560814, which is obtained with average linkage.

# list of linkage methods
linkage_methods = ['single', 'complete', 'average', 'centroid', 'ward', 'weighted']

# lists to save results of cophenetic correlation calculation
compare_cols = ["Linkage", "Cophenetic Coefficient"]
compare = []

# to create a subplot image
fig, axs = plt.subplots(len(linkage_methods), 1, figsize=(15, 30))

# We will enumerate through the list of linkage methods above
# For each linkage method, we will plot the dendrogram and calculate the cophenetic correlation
for i, method in enumerate(linkage_methods):
    Z = linkage(hc_df, metric="euclidean", method=method)

    dendrogram(Z, ax=axs[i])
    axs[i].set_title(f"Dendrogram ({method.capitalize()} Linkage)")

    coph_corr, coph_dist = cophenet(Z, pdist(hc_df))
    axs[i].annotate(
        f"Cophenetic\nCorrelation\n{coph_corr:0.2f}",
        (0.80, 0.80),
        xycoords="axes fraction",
    )
    compare.append([method, coph_corr])

# create and print a dataframe to compare cophenetic correlations for different linkage methods
df_cc = pd.DataFrame(compare, columns = compare_cols)
df_cc = df_cc.sort_values(by="Cophenetic Coefficient")
df_cc

HCmodel = AgglomerativeClustering(n_clusters = 5,  metric = 'euclidean', linkage = 'average')
HCmodel.fit(hc_df)

AgglomerativeClustering(linkage='average', n_clusters=5)

AgglomerativeClustering(linkage='average', n_clusters=5)

# creating a copy of the original data
df2 = df.copy()

# adding hierarchical cluster labels to the original and scaled dataframes
hc_df["HC_segments"] = HCmodel.labels_
df2["HC_segments"] = HCmodel.labels_

hc_cluster_profile = df2.groupby("HC_segments").mean(numeric_only = True)

hc_cluster_profile['count_in_each_segment'] = (
    df2.groupby("HC_segments")["Security"].count().values
)

hc_cluster_profile.style.highlight_max(color="lightgreen", axis=0)

## Complete the code to print the companies in each cluster
for cl in df2["HC_segments"].unique():
    print("In cluster {}, the following companies are present:".format(cl))
    print(df2[df2["HC_segments"] == cl]["Security"].unique())
    print("=" * 100, "\n")
    print()

In cluster 0, the following companies are present:
['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'Adobe Systems Inc', 'Analog Devices, Inc.', ..., 'Yahoo Inc.', 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis']
Length: 334
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


In cluster 2, the following companies are present:
['Apache Corporation', 'Chesapeake Energy']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


In cluster 1, the following companies are present:
['Bank of America Corp', 'Intel Corp.']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


In cluster 3, the following companies are present:
['Facebook']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
==================================================================================================== 


In cluster 4, the following companies are present:
['Priceline.com Inc']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
====================================================================================================

df2.groupby(["HC_segments", "GICS Sector"])["Security"].count()

plt.figure(figsize = (20,20))
plt.suptitle("Boxplot of numerical variables for each cluster")

for i, variable in enumerate(num_col):
    plt.subplot(3, 4, i+1)
    sns.boxplot(data = df2, x = "HC_segments", y = variable, palette = 'husl', flierprops = dict(markerfacecolor = 'lightcoral', marker = 'D',))

plt.tight_layout(pad = 2.0)

HCmodel = AgglomerativeClustering(n_clusters = 5, metric = 'euclidean', linkage = 'ward')
HCmodel.fit(hc_df)

AgglomerativeClustering(n_clusters=5)

AgglomerativeClustering(n_clusters=5)

# Creating a copy of the original data for Euclidean Ward
dfHCW = df.copy()

# Adding hierarchical cluster labels to the original and scaled datagrames
hc_df['HC_segments'] = HCmodel.labels_
dfHCW['HC_segments'] = HCmodel.labels_

hc_cluster_profile = dfHCW.groupby("HC_segments").mean(numeric_only = True)

hc_cluster_profile['count_in_each_segment'] = (dfHCW.groupby("HC_segments")["Security"].count().values)

hc_cluster_profile.style.highlight_max(color='lightgreen', axis = 0)

# List the names of companies in each cluster
for cl in dfHCW['HC_segments'].unique():
    print('There are total of', dfHCW[dfHCW['HC_segments'] == cl]['Security'].nunique(), 'companies in cluster {}, the following companies are present:'.format(cl))
    print(dfHCW[dfHCW['HC_segments'] == cl]['Security'].unique())
    print('*' * 100, '\n')
    print()

There are total of 287 companies in cluster 1, the following companies are present:
['American Airlines Group', 'AbbVie', 'Abbott Laboratories', 'Adobe Systems Inc', 'Analog Devices, Inc.', ..., 'Xylem Inc.', 'Yum! Brands Inc', 'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis']
Length: 287
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
**************************************************************************************************** 


There are total of 15 companies in cluster 0, the following companies are present:
['Alliance Data Systems', 'Alexion Pharmaceuticals', 'Amgen Inc', 'Amazon.com Inc', 'Chipotle Mexican Grill', ..., 'Netflix Inc.', 'Priceline.com Inc', 'Regeneron', 'Waters Corporation', 'Yahoo Inc.']
Length: 15
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
**************************************************************************************************** 


There are total of 7 companies in cluster 3, the following companies are present:
['Allegion', 'Apache Corporation', 'Chesapeake Energy', 'Charter Communications', 'Colgate-Palmolive', 'Kimberly-Clark', 'S&P Global, Inc.']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
**************************************************************************************************** 


There are total of 22 companies in cluster 4, the following companies are present:
['Anadarko Petroleum Corp', 'Baker Hughes Inc', 'Cabot Oil & Gas', 'Concho Resources', 'Devon Energy Corp.', ..., 'Spectra Energy Corp.', 'Southwestern Energy', 'Teradata Corp.', 'Williams Cos.', 'Cimarex Energy']
Length: 22
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
**************************************************************************************************** 


There are total of 9 companies in cluster 2, the following companies are present:
['Citigroup Inc.', 'Ford Motor', 'JPMorgan Chase & Co.', 'Coca Cola Company', 'Pfizer Inc.', 'AT&T Inc', 'Verizon Communications', 'Wells Fargo', 'Exxon Mobil Corp.']
Categories (340, object): ['3M Company', 'AFLAC Inc', 'AMETEK Inc', 'AT&T Inc', ...,
                           'Zimmer Biomet Holdings', 'Zions Bancorp', 'Zoetis', 'eBay Inc.']
****************************************************************************************************

dfHCW.groupby(['HC_segments', 'GICS Sector'])['Security'].count()

plt.figure(figsize = (20,20))
plt.suptitle("Boxplot of numerical variables for each cluster with Euclidean Ward")

for i, variable in enumerate(num_col):
    plt.subplot(3, 4, i+1)
    sns.boxplot(data = dfHCW, x = 'HC_segments', y = variable, palette = 'husl', flierprops = dict(markerfacecolor = 'lightcoral', marker = 'D',))

plt.tight_layout(pad = 2.0)

# Generate a color map with a unique color for each bar
num_of_bars = len(dfHCW.columns) - 1  # Subtract 1 for the GROUP column
cm = plt.get_cmap('tab10')
colors = [cm(1. * i / num_of_bars) for i in range(num_of_bars)]

dfHCW.groupby("HC_segments").mean(numeric_only = True).plot.bar(figsize = (30,15), color = colors)

<Axes: xlabel='HC_segments'>

	Ticker Symbol	Security	GICS Sector	GICS Sub Industry	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio
102	DVN	Devon Energy Corp.	Energy	Oil & Gas Exploration & Production	32.000000	-15.478079	2.923698	205	70	830000000	-14454000000	-35.55	4.065823e+08	93.089287	1.785616
125	FB	Facebook	Information Technology	Internet Software & Services	104.660004	16.224320	1.320606	8	958	592000000	3669000000	1.31	2.800763e+09	79.893133	5.884467
11	AIV	Apartment Investment & Mgmt	Real Estate	REITs	40.029999	7.578608	1.163334	15	47	21818000	248710000	1.52	1.636250e+08	26.335526	-1.269332
248	PG	Procter & Gamble	Consumer Staples	Personal Products	79.410004	10.660538	0.806056	17	129	160383000	636056000	3.28	4.913916e+08	24.070121	-2.256747
238	OXY	Occidental Petroleum	Energy	Oil & Gas Exploration & Production	67.610001	0.865287	1.589520	32	64	-588000000	-7829000000	-10.23	7.652981e+08	93.089287	3.345102
336	YUM	Yum! Brands Inc	Consumer Discretionary	Restaurants	52.516175	-8.698917	1.478877	142	27	159000000	1293000000	2.97	4.353535e+08	17.682214	-3.838260
112	EQT	EQT Corporation	Energy	Oil & Gas Exploration & Production	52.130001	-21.253771	2.364883	2	201	523803000	85171000	0.56	1.520911e+08	93.089287	9.567952
147	HAL	Halliburton Co.	Energy	Oil & Gas Equipment & Services	34.040001	-5.101751	1.966062	4	189	7786000000	-671000000	-0.79	8.493671e+08	93.089287	17.345857
89	DFS	Discover Financial Services	Financials	Consumer Finance	53.619999	3.653584	1.159897	20	99	2288000000	2297000000	5.14	4.468872e+08	10.431906	-0.375934
173	IVZ	Invesco Ltd.	Financials	Asset Management & Custody Banks	33.480000	7.067477	1.580839	12	67	412000000	968100000	2.26	4.283628e+08	14.814159	4.218620

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
Ticker Symbol	340	340	AAL	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Security	340	340	American Airlines Group	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
GICS Sector	340	11	Industrials	53	NaN	NaN	NaN	NaN	NaN	NaN	NaN
GICS Sub Industry	340	104	Oil & Gas Exploration & Production	16	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Current Price	340.0	NaN	NaN	NaN	80.862345	98.055086	4.5	38.555	59.705	92.880001	1274.949951
Price Change	340.0	NaN	NaN	NaN	4.078194	12.006338	-47.129693	-0.939484	4.819505	10.695493	55.051683
Volatility	340.0	NaN	NaN	NaN	1.525976	0.591798	0.733163	1.134878	1.385593	1.695549	4.580042
ROE	340.0	NaN	NaN	NaN	39.597059	96.547538	1.0	9.75	15.0	27.0	917.0
Cash Ratio	340.0	NaN	NaN	NaN	70.023529	90.421331	0.0	18.0	47.0	99.0	958.0
Net Cash Flow	340.0	NaN	NaN	NaN	55537620.588235	1946365312.175789	-11208000000.0	-193906500.0	2098000.0	169810750.0	20764000000.0
Net Income	340.0	NaN	NaN	NaN	1494384602.941176	3940150279.327937	-23528000000.0	352301250.0	707336000.0	1899000000.0	24442000000.0
Earnings Per Share	340.0	NaN	NaN	NaN	2.776662	6.587779	-61.2	1.5575	2.895	4.62	50.09
Estimated Shares Outstanding	340.0	NaN	NaN	NaN	577028337.75403	845849595.417695	27672156.86	158848216.1	309675137.8	573117457.325	6159292035.0
P/E Ratio	340.0	NaN	NaN	NaN	32.612563	44.348731	2.935451	15.044653	20.819876	31.764755	528.039074
P/B Ratio	340.0	NaN	NaN	NaN	-1.718249	13.966912	-76.119077	-4.352056	-1.06717	3.917066	129.064585

	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio
0	-0.393341	0.493950	0.272749	0.989601	-0.210698	-0.339355	1.554415	1.309399	0.107863	-0.652487	-0.506653
1	-0.220837	0.355439	1.137045	0.937737	0.077269	-0.002335	0.927628	0.056755	1.250274	-0.311769	-0.504205
2	-0.367195	0.602479	-0.427007	-0.192905	-0.033488	0.454058	0.744371	0.024831	1.098021	-0.391502	0.094941
3	0.133567	0.825696	-0.284802	-0.317379	1.218059	-0.152497	-0.219816	-0.230563	-0.091622	0.947148	0.424333
4	-0.260874	-0.492636	0.296470	-0.265515	2.237018	0.133564	-0.202703	-0.374982	1.978399	3.293307	0.199196

	Ticker Symbol	Security	GICS Sector	GICS Sub Industry	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio	KM_segments
0	AAL	American Airlines Group	Industrials	Airlines	42.349998	9.999995	1.687151	135	51	-604000000	7610000000	11.39	6.681299e+08	3.718174	-8.784219	0
1	ABBV	AbbVie	Health Care	Pharmaceuticals	59.240002	8.339433	2.197887	130	77	51000000	5144000000	3.15	1.633016e+09	18.806350	-8.750068	0
2	ABT	Abbott Laboratories	Health Care	Health Care Equipment	44.910000	11.301121	1.273646	21	67	938000000	4423000000	2.94	1.504422e+09	15.275510	-0.394171	0
3	ADBE	Adobe Systems Inc	Information Technology	Application Software	93.940002	13.977195	1.357679	9	180	-240840000	629551000	1.26	4.996437e+08	74.555557	4.199651	0
4	ADI	Analog Devices, Inc.	Information Technology	Semiconductors	55.320000	-1.827858	1.701169	14	272	315120000	696878000	0.31	2.247994e+09	178.451613	1.059810	4
5	ADM	Archer-Daniels-Midland Co	Consumer Staples	Agricultural Products	36.680000	-12.017268	1.516493	10	49	-189000000	1849000000	2.99	6.183946e+08	12.267559	7.496831	0
6	ADS	Alliance Data Systems	Information Technology	Data Processing & Outsourced Services	276.570007	6.189286	1.116976	30	25	90885000	596541000	8.91	6.695185e+07	31.040405	129.064585	4
7	AEE	Ameren Corp	Utilities	MultiUtilities	43.230000	2.174424	1.124186	9	14	287000000	636000000	2.60	2.446154e+08	16.626923	-0.719497	0
8	AEP	American Electric Power	Utilities	Electric Utilities	58.270000	2.371753	1.068485	11	9	13900000	2052300000	3.13	4.218978e+08	18.456543	-3.022649	0
9	AFL	AFLAC Inc	Financials	Life & Health Insurance	59.900002	3.027181	1.048295	14	99	-308000000	2533000000	5.88	4.307823e+08	10.187075	-1.883912	0

	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio	KM_segments
count	340.000000	340.000000	340.000000	340.000000	340.000000	3.400000e+02	3.400000e+02	340.000000	3.400000e+02	340.000000	340.000000	340.000000
mean	80.862345	4.078194	1.525976	39.597059	70.023529	5.553762e+07	1.494385e+09	2.776662	5.770283e+08	32.612563	-1.718249	0.476471
std	98.055086	12.006338	0.591798	96.547538	90.421331	1.946365e+09	3.940150e+09	6.587779	8.458496e+08	44.348731	13.966912	1.158287
min	4.500000	-47.129693	0.733163	1.000000	0.000000	-1.120800e+10	-2.352800e+10	-61.200000	2.767216e+07	2.935451	-76.119077	0.000000
25%	38.555000	-0.939484	1.134878	9.750000	18.000000	-1.939065e+08	3.523012e+08	1.557500	1.588482e+08	15.044653	-4.352056	0.000000
50%	59.705000	4.819505	1.385593	15.000000	47.000000	2.098000e+06	7.073360e+08	2.895000	3.096751e+08	20.819876	-1.067170	0.000000
75%	92.880001	10.695493	1.695549	27.000000	99.000000	1.698108e+08	1.899000e+09	4.620000	5.731175e+08	31.764755	3.917066	0.000000
max	1274.949951	55.051683	4.580042	917.000000	958.000000	2.076400e+10	2.444200e+10	50.090000	6.159292e+09	528.039074	129.064585	4.000000

	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio	count_in_each_segment
KM_segments
0	72.841355	5.170873	1.381148	35.054348	53.481884	121362.318841	1548856637.681159	3.667808	441678653.226848	23.624755	-3.371505	276
1	38.194074	-15.163792	2.832583	48.148148	48.777778	-104439592.592593	-2427850333.333333	-6.262222	480223773.242222	89.543050	1.347101	27
2	24.485001	-13.351992	3.482611	802.000000	51.000000	-1292500000.000000	-19106500000.000000	-41.815000	519573983.250000	60.748608	1.565141	2
3	46.672222	5.166566	1.079367	25.000000	58.333333	-3040666666.666667	14848444444.444445	3.435556	4564959946.222222	15.596051	-6.354193	9
4	226.489357	13.425086	1.710605	25.346154	273.192308	1985390923.076923	1951357384.615385	5.905385	738326814.835385	72.627508	14.000626	26

	Linkage	Cophenetic Coefficient
4	ward	0.710118
1	complete	0.787328
5	weighted	0.869378
0	single	0.923227
3	centroid	0.931401
2	average	0.942254

	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio	count_in_each_segment
HC_segments
0	77.884243	4.105986	1.516865	35.320359	66.775449	-32825817.365269	1535255703.592814	2.903308	559027333.145509	32.437511	-1.781988	334
1	25.640000	11.237908	1.322355	12.500000	130.500000	16755500000.000000	13654000000.000000	3.295000	2791829362.100000	13.649696	1.508484	2
2	24.485001	-13.351992	3.482611	802.000000	51.000000	-1292500000.000000	-19106500000.000000	-41.815000	519573983.250000	60.748608	1.565141	2
3	104.660004	16.224320	1.320606	8.000000	958.000000	592000000.000000	3669000000.000000	1.310000	2800763359.000000	79.893133	5.884467	1
4	1274.949951	3.190527	1.268340	29.000000	184.000000	-1671386000.000000	2551360000.000000	50.090000	50935516.070000	25.453183	-1.052429	1

	Current Price	Price Change	Volatility	ROE	Cash Ratio	Net Cash Flow	Net Income	Earnings Per Share	Estimated Shares Outstanding	P/E Ratio	P/B Ratio	count_in_each_segment
HC_segments
0	326.198218	10.563242	1.642560	14.400000	309.466667	288850666.666667	864498533.333333	7.785333	544900261.301333	113.095334	19.142151	15
1	72.432035	5.255290	1.426349	25.512195	60.881533	196157425.087108	1623022236.933798	3.652840	462816085.049861	24.645506	-2.618234	287
2	46.672222	5.166566	1.079367	25.000000	58.333333	-3040666666.666667	14848444444.444445	3.435556	4564959946.222222	15.596051	-6.354193	9
3	84.355716	3.854981	1.827670	633.571429	33.571429	-568400000.000000	-4968157142.857142	-10.841429	398169036.442857	42.284541	-11.589502	7
4	36.440455	-16.073408	2.832884	57.500000	42.409091	-472834090.909091	-3161045227.272727	-8.005000	514367806.201818	85.555682	0.836839	22

Trade & Ahead¶

Problem Statement¶

Context¶

Objective¶

Data Dictionary¶

Importing necessary libraries and data¶

Loading the Dataset¶

Data Overview¶

Data Shape¶

Displaying few rows of the dataset¶

Checking data types¶

Checking for duplicates and missing values¶

Statistical summary of the dataset¶

Creating a copy of the Dataset¶

Convert all object type columns to category¶

Exploratory Data Analysis (EDA)¶

Global Functions¶

Univariate Analysis¶

Current Price¶

Price Change¶

Volatility¶

ROE¶

Cash Ratio¶

Net Cash Flow¶

Net Income¶

Earnings Per Share¶

Estimated Shares Outstanding¶

P/E Ratio¶

P/B Ratio¶

GICS Sector¶

GICS Sub Industry¶

Bivariate Analysis¶

Correlation Check¶

Stocks of which economic sector have seen the maximum price increase on average¶

How Does the Average Cash Ratio Vary Across Economic Sectors¶

How Does the P/E Ratio Vary, on Average, Across Economic Sectors?¶

How volatility varies, on average, across economic sectors?¶

Data Preprocessing¶

Data Scaling before Clustering¶

K-means Clustering¶

Elbow Plot¶

Determine k Using the Silhouette Score¶

Creating final model.¶

Cluster Profiling¶

Insights¶

Hierarchical Clustering¶

Cophenetic Correlation¶

Checking Dendograms¶

Creating model using sklearn with Euclidean and Average linkage¶

Cluster Profiling for Euclidean - Average¶

Creating Model using Euclidean distance and Ward linkage¶

Cluster Profiling for Euclidean - Ward¶

K-means vs Hierarchical Clustering¶

Actionable Insights and Recommendations¶

`Current Price`¶

`Price Change`¶

`Volatility`¶

`ROE`¶

`Cash Ratio`¶

`Net Cash Flow`¶

`Net Income`¶

`Earnings Per Share`¶

`Estimated Shares Outstanding`¶

`P/E Ratio`¶

`P/B Ratio`¶

`GICS Sector`¶

`GICS Sub Industry`¶

`Correlation Check`¶

`Stocks of which economic sector have seen the maximum price increase on average`¶

`How Does the Average Cash Ratio Vary Across Economic Sectors`¶

`How Does the P/E Ratio Vary, on Average, Across Economic Sectors?`¶

`How volatility varies, on average, across economic sectors?`¶