ReCell¶

Problem Statement¶

Business Context¶

Buying and selling used phones and tablets used to be something that happened on a handful of online marketplace sites. But the used and refurbished device market has grown considerably over the past decade, and a new IDC (International Data Corporation) forecast predicts that the used phone market will be worth \$52.7bn by 2023, with a compound annual growth rate (CAGR) of 13.6% from 2018 to 2023. This growth can be attributed to an uptick in demand for used phones and tablets that offer considerable savings compared with new models.

Refurbished and used devices continue to provide cost-effective alternatives to both consumers and businesses that are looking to save money when purchasing one. There are plenty of other benefits associated with the used device market. Used and refurbished devices can be sold with warranties and can also be insured with proof of purchase. Third-party vendors/platforms, such as Verizon, Amazon, etc., provide attractive offers to customers for refurbished devices. Maximizing the longevity of devices through second-hand trade also reduces their environmental impact and helps in recycling and reducing waste. The impact of the COVID-19 outbreak may further boost this segment as consumers cut back on discretionary spending and buy phones and tablets only for immediate needs.

Objective¶

The rising potential of this comparatively under-the-radar market fuels the need for an ML-based solution to develop a dynamic pricing strategy for used and refurbished devices. ReCell, a startup aiming to tap the potential in this market, has consulted me as a data scientist. They want me to analyze the data provided and build a linear regression model to predict the price of a used phone/tablet and identify factors that significantly influence it.

Data Description¶

The data contains the different attributes of used/refurbished phones and tablets. The data was collected in the year 2021. The detailed data dictionary is given below.

  • brand_name: Name of manufacturing brand
  • os: OS on which the device runs
  • screen_size: Size of the screen in cm
  • 4g: Whether 4G is available or not
  • 5g: Whether 5G is available or not
  • main_camera_mp: Resolution of the rear camera in megapixels
  • selfie_camera_mp: Resolution of the front camera in megapixels
  • int_memory: Amount of internal memory (ROM) in GB
  • ram: Amount of RAM in GB
  • battery: Energy capacity of the device battery in mAh
  • weight: Weight of the device in grams
  • release_year: Year when the device model was released
  • days_used: Number of days the used/refurbished device has been used
  • normalized_new_price: Normalized price of a new device of the same model in euros
  • normalized_used_price: Normalized price of the used/refurbished device in euros

Importing necessary libraries¶

In [2]:
# Installing the libraries with the specified version.
# uncomment and run the following line if Google Colab is being used
# !pip install scikit-learn==1.2.2 seaborn==0.13.1 matplotlib==3.7.1 numpy==1.25.2 pandas==1.5.3 -q --user
In [3]:
# Installing the libraries with the specified version.
# uncomment and run the following lines if Jupyter Notebook is being used
# !pip install scikit-learn==1.2.2 seaborn==0.11.1 matplotlib==3.3.4 numpy==1.24.3 pandas==1.5.2 -q --user

Note: After running the above cell, kindly restart the notebook kernel and run all cells sequentially from the start again.

In [4]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# split the data into train and test
from sklearn.model_selection import train_test_split

# to build linear regression_model
from sklearn.linear_model import LinearRegression

# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# to build linear regression_model using statsmodels
import statsmodels.api as sm

# to compute VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Loading the dataset¶

In [5]:
# run the following lines if Google Colab is being used (mounts Google Drive)
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [6]:
# loading data
data = pd.read_csv('/content/drive/MyDrive/content/used_device_data.csv')

Data Overview¶

Displaying the first few rows of the dataset

In [7]:
data.head()
Out[7]:
brand_name os screen_size 4g 5g main_camera_mp selfie_camera_mp int_memory ram battery weight release_year days_used normalized_used_price normalized_new_price
0 Honor Android 14.50 yes no 13.0 5.0 64.0 3.0 3020.0 146.0 2020 127 4.307572 4.715100
1 Honor Android 17.30 yes yes 13.0 16.0 128.0 8.0 4300.0 213.0 2020 325 5.162097 5.519018
2 Honor Android 16.69 yes yes 13.0 8.0 128.0 8.0 4200.0 213.0 2020 162 5.111084 5.884631
3 Honor Android 25.50 yes yes 13.0 8.0 64.0 6.0 7250.0 480.0 2020 345 5.135387 5.630961
4 Honor Android 15.32 yes no 13.0 8.0 64.0 3.0 5000.0 185.0 2020 293 4.389995 4.947837

Displaying the shape of the dataset

In [8]:
data.shape
Out[8]:
(3454, 15)

Observations

Total rows in the dataset: 3454

Total columns in the dataset: 15

Data types of the columns for the dataset

In [9]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3454 entries, 0 to 3453
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   brand_name             3454 non-null   object 
 1   os                     3454 non-null   object 
 2   screen_size            3454 non-null   float64
 3   4g                     3454 non-null   object 
 4   5g                     3454 non-null   object 
 5   main_camera_mp         3275 non-null   float64
 6   selfie_camera_mp       3452 non-null   float64
 7   int_memory             3450 non-null   float64
 8   ram                    3450 non-null   float64
 9   battery                3448 non-null   float64
 10  weight                 3447 non-null   float64
 11  release_year           3454 non-null   int64  
 12  days_used              3454 non-null   int64  
 13  normalized_used_price  3454 non-null   float64
 14  normalized_new_price   3454 non-null   float64
dtypes: float64(9), int64(2), object(4)
memory usage: 404.9+ KB

Observations:

  • There are 4 object type columns
  • There are 9 float type columns
  • There are 2 integer type columns

We will convert the object-type columns to the category dtype (this reduces memory usage and is convenient for later encoding):

In [10]:
category_col = data.select_dtypes(exclude=np.number).columns.tolist()
data[category_col] = data[category_col].astype("category")
In [11]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3454 entries, 0 to 3453
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   brand_name             3454 non-null   category
 1   os                     3454 non-null   category
 2   screen_size            3454 non-null   float64 
 3   4g                     3454 non-null   category
 4   5g                     3454 non-null   category
 5   main_camera_mp         3275 non-null   float64 
 6   selfie_camera_mp       3452 non-null   float64 
 7   int_memory             3450 non-null   float64 
 8   ram                    3450 non-null   float64 
 9   battery                3448 non-null   float64 
 10  weight                 3447 non-null   float64 
 11  release_year           3454 non-null   int64   
 12  days_used              3454 non-null   int64   
 13  normalized_used_price  3454 non-null   float64 
 14  normalized_new_price   3454 non-null   float64 
dtypes: category(4), float64(9), int64(2)
memory usage: 312.2 KB

Statistical summary of the dataset

In [12]:
data.describe(include='all').T
Out[12]:
count unique top freq mean std min 25% 50% 75% max
brand_name 3454 34 Others 502 NaN NaN NaN NaN NaN NaN NaN
os 3454 4 Android 3214 NaN NaN NaN NaN NaN NaN NaN
screen_size 3454.0 NaN NaN NaN 13.713115 3.80528 5.08 12.7 12.83 15.34 30.71
4g 3454 2 yes 2335 NaN NaN NaN NaN NaN NaN NaN
5g 3454 2 no 3302 NaN NaN NaN NaN NaN NaN NaN
main_camera_mp 3275.0 NaN NaN NaN 9.460208 4.815461 0.08 5.0 8.0 13.0 48.0
selfie_camera_mp 3452.0 NaN NaN NaN 6.554229 6.970372 0.0 2.0 5.0 8.0 32.0
int_memory 3450.0 NaN NaN NaN 54.573099 84.972371 0.01 16.0 32.0 64.0 1024.0
ram 3450.0 NaN NaN NaN 4.036122 1.365105 0.02 4.0 4.0 4.0 12.0
battery 3448.0 NaN NaN NaN 3133.402697 1299.682844 500.0 2100.0 3000.0 4000.0 9720.0
weight 3447.0 NaN NaN NaN 182.751871 88.413228 69.0 142.0 160.0 185.0 855.0
release_year 3454.0 NaN NaN NaN 2015.965258 2.298455 2013.0 2014.0 2015.5 2018.0 2020.0
days_used 3454.0 NaN NaN NaN 674.869716 248.580166 91.0 533.5 690.5 868.75 1094.0
normalized_used_price 3454.0 NaN NaN NaN 4.364712 0.588914 1.536867 4.033931 4.405133 4.7557 6.619433
normalized_new_price 3454.0 NaN NaN NaN 5.233107 0.683637 2.901422 4.790342 5.245892 5.673718 7.847841

Observations:

  • Average screen_size is 13.71 cm (roughly 5.4 inches)
  • Average main (rear) camera resolution is about 9.5 MP
  • Average RAM is 4.04 GB
  • Average number of days of use per device is about 675
  • Average normalized used price is 4.36, ranging between 1.54 and 6.62 (normalized euros)
  • Average normalized new price is 5.23, ranging between 2.90 and 7.85 (normalized euros)
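
Since screen_size is recorded in centimetres (per the data dictionary), a quick conversion to inches (dividing by 2.54) puts the average of 13.71 cm at roughly 5.4 inches. A minimal sketch of this check (output not shown):

In [ ]:
# converting screen size from centimetres to inches (1 inch = 2.54 cm) for easier interpretation
(data['screen_size'] / 2.54).describe()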

Checking for duplicate values

In [13]:
data.duplicated().sum()
Out[13]:
0

There are NO duplicate values in the Dataset

Checking for missing values

In [14]:
data.isnull().sum()
Out[14]:
0
brand_name 0
os 0
screen_size 0
4g 0
5g 0
main_camera_mp 179
selfie_camera_mp 2
int_memory 4
ram 4
battery 6
weight 7
release_year 0
days_used 0
normalized_used_price 0
normalized_new_price 0

Observations: There are missing values in the dataset as follows:

  • main_camera_mp: 179 missing values
  • selfie_camera_mp: 2 missing values
  • int_memory: 4 missing values
  • ram: 4 missing values
  • battery: 6 missing values
  • weight: 7 missing values

Since missing values were found, they will be imputed later in the process to avoid errors in downstream calculations.

In [15]:
# creating a copy of the data so that original data is not changed.
df = data.copy()

Exploratory Data Analysis (EDA)¶

  • EDA is an important part of any project involving data.
  • It is important to investigate and understand the data better before building a model with it.
  • A few questions have been mentioned below which will help you approach the analysis in the right manner and generate insights from the data.
  • A thorough analysis of the data, in addition to the questions mentioned below, should be done.

Questions:

  1. What does the distribution of normalized used device prices look like?
  2. What percentage of the used device market is dominated by Android devices?
  3. The amount of RAM is important for the smooth functioning of a device. How does the amount of RAM vary with the brand?
  4. A large battery often increases a device's weight, making it feel uncomfortable in the hands. How does the weight vary for phones and tablets offering large batteries (more than 4500 mAh)?
  5. Bigger screens are desirable for entertainment purposes as they offer a better viewing experience. How many phones and tablets are available across different brands with a screen size larger than 6 inches?
  6. A lot of devices nowadays offer great selfie cameras, allowing us to capture our favorite moments with loved ones. What is the distribution of devices offering greater than 8MP selfie cameras across brands?
  7. Which attributes are highly correlated with the normalized price of a used device?

Univariate Analysis¶

In [16]:
# function to create labeled barplots


def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 2, 6))
    else:
        plt.figure(figsize=(n + 2, 6))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n],
    )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # width of the plot
        y = p.get_height()  # height of the plot

        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage

    plt.show()  # show the plot

Distribution of normalized_used_price¶

In [17]:
#Histogram
sns.histplot(df, x = 'normalized_used_price', kde=True);
plt.xlabel('normalized_used_price');
plt.axvline(x=df.normalized_used_price.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.normalized_used_price.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'normalized_used_price', showmeans=True, color="violet");
plt.xlabel('normalized_used_price');
plt.ylabel('Count');
plt.show()

Observations:

  • Normalized used price appears approximately normally distributed, with some outliers, especially at the lower end of the price range.
In [18]:
#Histogram
sns.histplot(df, x = 'normalized_new_price', kde=True);
plt.xlabel('normalized_new_price');
plt.axvline(x=df.normalized_new_price.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.normalized_new_price.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'normalized_new_price', showmeans=True, color="violet");
plt.xlabel('normalized_new_price');
plt.ylabel('Count');
plt.show()

Observations:

  • Normalized new price appears approximately normally distributed, with some outliers at both the higher and lower ends.

Distribution of screen_size¶

In [19]:
#Histogram
sns.histplot(df, x = 'screen_size', kde=True);
plt.xlabel('screen_size');
plt.axvline(x=df.screen_size.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.screen_size.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'screen_size', showmeans=True, color="violet");
plt.xlabel('screen_size');
plt.ylabel('Count');
plt.show()

Observations:

  • Screen size does not follow a clearly defined distribution; it is only vaguely bell-shaped, with outliers at both the smaller and larger ends.

Distribution of main_camera_mp¶

In [20]:
#Histogram
sns.histplot(df, x = 'main_camera_mp', kde=True);
plt.xlabel('main_camera_mp');
plt.axvline(x=df.main_camera_mp.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.main_camera_mp.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'main_camera_mp', showmeans=True, color="violet");
plt.xlabel('main_camera_mp');
plt.ylabel('Count');
plt.show()

Observations:

  • Main camera resolution appears slightly right-skewed (the mean is above the median), with outliers at the larger end.

Distribution of selfie_camera_mp¶

In [21]:
#Histogram
sns.histplot(df, x = 'selfie_camera_mp', kde=True);
plt.xlabel('selfie_camera_mp');
plt.axvline(x=df.selfie_camera_mp.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.selfie_camera_mp.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'selfie_camera_mp', showmeans=True, color="violet");
plt.xlabel('selfie_camera_mp');
plt.ylabel('Count');
plt.show()

Observations:

  • Selfie camera resolution is slightly right-skewed, with outliers at the larger end.

Distribution of int_memory¶

In [22]:
#Histogram
sns.histplot(df, x = 'int_memory', kde=True);
plt.xlabel('int_memory');
plt.axvline(x=df.int_memory.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.int_memory.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'int_memory', showmeans=True, color="violet");
plt.xlabel('int_memory');
plt.ylabel('Count');
plt.show()

Observations:

  • Internal memory seems to be skewed to the right with outliers to the larger end.

Distribution of ram¶

In [23]:
#Histogram
sns.histplot(df, x = 'ram', kde=True);
plt.xlabel('ram');
plt.axvline(x=df.ram.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.ram.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'ram', showmeans=True, color="violet");
plt.xlabel('ram');
plt.ylabel('Count');
plt.show()

Observations:

  • RAM does not follow a clear distribution; most devices have 4 GB (the 25th, 50th, and 75th percentiles are all 4 GB), with outliers on both sides.

Distribution of weight¶

In [24]:
#Histogram
sns.histplot(df, x = 'weight', kde=True);
plt.xlabel('weight');
plt.axvline(x=df.weight.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.weight.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'weight', showmeans=True, color="violet");
plt.xlabel('weight');
plt.ylabel('Count');
plt.show()

Observations:

  • Device weight is skewed to the right, with many outliers on the heavier side.

Distribution of battery¶

In [25]:
#Histogram
sns.histplot(df, x = 'battery', kde=True);
plt.xlabel('battery');
plt.axvline(x=df.battery.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.battery.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'battery', showmeans=True, color="violet");
plt.xlabel('battery');
plt.ylabel('Count');
plt.show()

Observations:

  • The battery capacity shows a multimodal distribution with some outliers on the larger end.

Distribution of days_used¶

In [26]:
#Histogram
sns.histplot(df, x = 'days_used', kde=True);
plt.xlabel('days_used');
plt.axvline(x=df.days_used.mean(),
            color='green', ls='--',
            lw=2.5)
plt.axvline(x=df.days_used.median(),
            color='red', ls='--',
            lw=2)
plt.show()

#Boxplot
sns.boxplot(df, x = 'days_used', showmeans=True, color="violet");
plt.xlabel('days_used');
plt.ylabel('Count');
plt.show()

Observations:

  • The number of days the devices have been used is slightly skewed to the left, with no outliers.

Distribution of brand_name¶

In [27]:
labeled_barplot(df, "brand_name", perc=True, n=10)

Observations:

  • The "Others" category, which groups several smaller brands, is the most common, accounting for 14.5% of the devices.
  • Samsung, Huawei, and LG take second, third, and fourth place with 9.9%, 7.3%, and 5.8% respectively.

Distribution of os¶

In [28]:
labeled_barplot(df,'os', perc=True, n=10)

Observations:

  • Android is by far the most common operating system, running on 93.1% of the devices.
  • iOS is the smallest category at only about 1%, while other systems grouped together account for about 4% of the devices.

Distribution of 4G¶

In [29]:
labeled_barplot(df,'4g', perc=True)

Observations:

  • The majority of devices (67.6%) support 4G.

Distribution of 5G¶

In [30]:
labeled_barplot(df,'5g', perc=True, n=10)

Observations:

  • Just 4.4% of devices are 5G-enabled; the vast majority (95.6%) do not support 5G.

Distribution of release_year¶

In [31]:
labeled_barplot(df,'release_year', perc=True)

Observations:

  • 2014 is the most common release year, accounting for 18.6% of devices.
  • 2013 and 2015 are the second and third most common with 16.5% and 14.9% respectively.
  • More recent years such as 2020 and 2019 are less common, at only 8% and 12.9% respectively.

Bivariate Analysis¶

Correlation Check¶

In [32]:
cols_list = df.select_dtypes(include=np.number).columns.tolist()
# dropping release_year as it is a temporal variable
cols_list.remove("release_year")

plt.figure(figsize=(15, 7))
sns.heatmap(
    df[cols_list].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()

Observations:

  • The normalized price of used devices is highly positively correlated with the normalized price of new devices.
  • Screen size is highly positively correlated with battery capacity and weight.
  • Battery capacity is positively correlated with weight.
  • The normalized used price is negatively correlated with the number of days the device has been used.
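
As a quick numeric complement to the heatmap, the correlations of each numeric column with the target can be listed directly (a small sketch reusing the cols_list defined above; output not shown):

In [ ]:
# correlation of each numeric feature with the normalized used price, sorted in descending order
df[cols_list].corr()['normalized_used_price'].sort_values(ascending=False)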

RAM across brands¶

In [33]:
plt.figure(figsize=(15, 5))
sns.boxplot(data=df, x="brand_name", y="ram")
plt.xticks(rotation=90)
plt.show()

Observations:

  • OnePlus devices show the largest typical RAM capacity.
  • Most brands show multiple outliers, with no clear distribution.
  • Brands differ substantially from one another in RAM capacity.
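
To back the boxplot with numbers, one could also look at the median RAM per brand (a small sketch; output not shown):

In [ ]:
# median RAM (GB) per brand, sorted in descending order
df.groupby('brand_name')['ram'].median().sort_values(ascending=False).head(10)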

Weight across brands for devices with large batteries (above 4500 mAh)¶

In [34]:
#Subsetting batteries larger than 4500 mAh
df_large_battery = df[df.battery > 4500]
df_large_battery.shape
Out[34]:
(341, 15)
In [35]:
plt.figure(figsize=(15, 5))
sns.boxplot(df_large_battery, x='brand_name', y='weight')
plt.xticks(rotation=90)
plt.show()

Observations:

  • Weight varies across brands with no particular pattern.
  • Samsung seems to offer a wider range of device weights than other brands.
  • Google devices all weigh almost the same, with very little variation.
  • Micromax has the lightest devices.

Devices available with screens bigger than 6 inches¶

In [36]:
df_large_screen = df[df.screen_size > 6 * 2.54]
df_large_screen.shape
Out[36]:
(1099, 15)
In [37]:
labeled_barplot(df_large_screen, 'brand_name', n=15)

Observations:

  • Huawei offers the largest number of large-screen devices, with 179 of the 1099 such devices.
  • Samsung and other brands follow in second and third place with 119 and 99 devices respectively.

Devices available with selfie cameras bigger than 8 MP¶

In [38]:
df_selfie_camera = df[df.selfie_camera_mp > 8]
df_selfie_camera.shape
Out[38]:
(655, 15)
In [39]:
labeled_barplot(df_selfie_camera, 'brand_name', n=15, perc=True)

Observations:

  • Huawei again comes first for high-resolution front cameras, representing 13.3% of this subset.
  • Vivo and Oppo come second and third with 11.9% and 11.5% respectively.

Devices available with back cameras bigger than 16 MP¶

In [40]:
df_main_camera = df[df.main_camera_mp > 16]
df_main_camera.shape
Out[40]:
(94, 15)
In [41]:
labeled_barplot(df_main_camera, 'brand_name', n=15, perc=True)

Observations:

  • For rear cameras above 16 MP, Sony dominates the subset with 39.4% of these devices.
  • Motorola comes second, accounting for 11.7% of the devices with rear cameras above 16 MP.

Price variation for devices across years¶

In [42]:
plt.figure(figsize=(12, 5))
sns.lineplot(df, x='release_year', y='normalized_used_price')
plt.show()

Observations:

  • There is a positive relationship between release year and the used device price: the newer the device, the higher the price.

Prices variation among 4G and 5G¶

In [43]:
plt.figure(figsize=(10, 4))

plt.subplot(121)
sns.boxplot(data=df, x="4g", y="normalized_used_price")

plt.subplot(122)
sns.boxplot(data=df, x="5g", y="normalized_used_price")

plt.show()

Observations:

  • Devices with 4g enabled have higher normalized price than those without this feature.
  • Devices with 5g enabled have higher normalized price than those without this feature.
  • Devices that have 5g enabled have higher normalized price than those that have only 4g available.
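
A small numeric check of the boxplots above, using group medians (sketch only; output not shown):

In [ ]:
# median normalized used price by 4G and 5G availability
print(df.groupby('4g')['normalized_used_price'].median())
print(df.groupby('5g')['normalized_used_price'].median())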

Data Preprocessing¶

  • Missing value treatment
  • Feature engineering (if needed)
  • Outlier detection and treatment (if needed)
  • Preparing data for modeling
  • Any other preprocessing steps (if needed)

Missing values treatment¶

Missing values will be imputed using medians computed within groups defined by release_year and brand_name.

In [62]:
df.isnull().sum()
Out[62]:
0
brand_name 0
os 0
screen_size 0
4g 0
5g 0
main_camera_mp 179
selfie_camera_mp 2
int_memory 4
ram 4
battery 6
weight 7
release_year 0
days_used 0
normalized_used_price 0
normalized_new_price 0

In [63]:
#Making a copy of the Dataset
df_all = df.copy()
In [64]:
#Columns that we need to impute
cols_impute = [
    "main_camera_mp",
    "selfie_camera_mp",
    "int_memory",
    "ram",
    "battery",
    "weight",
]
In [65]:
#Loop over columns to fill with median
for col in cols_impute:
    df_all[col] = df_all[col].fillna(
        value=df_all.groupby(["release_year", "brand_name"])[col].transform("median")
    )
In [66]:
#New check to find null
df_all.isnull().sum()
Out[66]:
0
brand_name 0
os 0
screen_size 0
4g 0
5g 0
main_camera_mp 179
selfie_camera_mp 2
int_memory 0
ram 0
battery 6
weight 7
release_year 0
days_used 0
normalized_used_price 0
normalized_new_price 0

Remaining missing values will be imputed using medians grouped by brand_name only.

In [67]:
cols_impute = [
    "main_camera_mp",
    "selfie_camera_mp",
    "battery",
    "weight",
]
In [68]:
#Loop over columns to fill with median
for col in cols_impute:
    df_all[col] = df_all[col].fillna(
        value=df_all.groupby(["brand_name"])[col].transform("median")
    )
In [69]:
#New check to find null
df_all.isnull().sum()
Out[69]:
0
brand_name 0
os 0
screen_size 0
4g 0
5g 0
main_camera_mp 10
selfie_camera_mp 0
int_memory 0
ram 0
battery 0
weight 0
release_year 0
days_used 0
normalized_used_price 0
normalized_new_price 0

Finally, the remaining missing main_camera_mp values are filled using the overall column median.

In [70]:
df_all["main_camera_mp"] = df_all["main_camera_mp"].fillna(df_all["main_camera_mp"].median())
In [71]:
# Final check looking for empty values
df_all.isnull().sum()
Out[71]:
0
brand_name 0
os 0
screen_size 0
4g 0
5g 0
main_camera_mp 0
selfie_camera_mp 0
int_memory 0
ram 0
battery 0
weight 0
release_year 0
days_used 0
normalized_used_price 0
normalized_new_price 0

Feature engineering¶

Procedure:

  • We need to identify the number of years a device has been in service (since its release) and determine whether this feature influences potential buyers' decisions.
In [72]:
df_all["years_since_release"] = 2021 - df_all["release_year"]
df_all.drop("release_year", axis=1, inplace=True)
df_all["years_since_release"].describe()
Out[72]:
years_since_release
count 3454.000000
mean 5.034742
std 2.298455
min 1.000000
25% 3.000000
50% 5.500000
75% 7.000000
max 8.000000

Outlier detection and treatment¶

In [73]:
# outlier detection using boxplot
num_cols = df_all.select_dtypes(include=np.number).columns.tolist()

plt.figure(figsize=(15, 15))

for i, variable in enumerate(num_cols):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(df_all, x=variable, color='green')
    plt.tight_layout(pad=2)

plt.show()

Observations:

  • There are quite a few outliers in the data
  • However, we will not treat them as they are proper values
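
For reference only, a minimal sketch of how the outliers could be counted per column using the 1.5*IQR rule (no treatment is applied, as noted above):

In [ ]:
# counting 1.5*IQR outliers per numeric column (for reference only; outliers are kept as valid values)
Q1 = df_all[num_cols].quantile(0.25)
Q3 = df_all[num_cols].quantile(0.75)
IQR = Q3 - Q1
((df_all[num_cols] < (Q1 - 1.5 * IQR)) | (df_all[num_cols] > (Q3 + 1.5 * IQR))).sum()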

Preparing data for modeling¶

Process to follow¶

  • Predict the normalized price of used devices using a model
  • Define the dependent and independent variables
  • Add an intercept (constant) to the data
  • Encode the categorical features (create dummies)
  • Split the data into a training set (70% of the data) and a test set (30% of the data)
  • Build a Linear Regression model using the training data and then check its performance
In [74]:
# defining the dependent and independent variables
X = df_all.drop(['normalized_used_price'], axis=1)
y = df_all['normalized_used_price']

Independent variables (X)

In [75]:
print(X.head())
  brand_name       os  screen_size   4g   5g  main_camera_mp  \
0      Honor  Android        14.50  yes   no            13.0   
1      Honor  Android        17.30  yes  yes            13.0   
2      Honor  Android        16.69  yes  yes            13.0   
3      Honor  Android        25.50  yes  yes            13.0   
4      Honor  Android        15.32  yes   no            13.0   

   selfie_camera_mp  int_memory  ram  battery  weight  days_used  \
0               5.0        64.0  3.0   3020.0   146.0        127   
1              16.0       128.0  8.0   4300.0   213.0        325   
2               8.0       128.0  8.0   4200.0   213.0        162   
3               8.0        64.0  6.0   7250.0   480.0        345   
4               8.0        64.0  3.0   5000.0   185.0        293   

   normalized_new_price  years_since_release  
0              4.715100                    1  
1              5.519018                    1  
2              5.884631                    1  
3              5.630961                    1  
4              4.947837                    1  

Dependent variable (y)

In [76]:
print(y.head())
0    4.307572
1    5.162097
2    5.111084
3    5.135387
4    4.389995
Name: normalized_used_price, dtype: float64

Adding an intercept to the data

In [77]:
X = sm.add_constant(X)

Creating dummy variables for the categorical features

In [78]:
X = pd.get_dummies(
    X,
    columns=X.select_dtypes(include=["object", "category"]).columns.tolist(),
    drop_first=True,
)
#transforming booleans into floats (1, 0)
X = X.astype(float)

X.head()
Out[78]:
const screen_size main_camera_mp selfie_camera_mp int_memory ram battery weight days_used normalized_new_price ... brand_name_Spice brand_name_Vivo brand_name_XOLO brand_name_Xiaomi brand_name_ZTE os_Others os_Windows os_iOS 4g_yes 5g_yes
0 1.0 14.50 13.0 5.0 64.0 3.0 3020.0 146.0 127.0 4.715100 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 1.0 17.30 13.0 16.0 128.0 8.0 4300.0 213.0 325.0 5.519018 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0
2 1.0 16.69 13.0 8.0 128.0 8.0 4200.0 213.0 162.0 5.884631 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0
3 1.0 25.50 13.0 8.0 64.0 6.0 7250.0 480.0 345.0 5.630961 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0
4 1.0 15.32 13.0 8.0 64.0 3.0 5000.0 185.0 293.0 4.947837 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

5 rows × 49 columns

Creating the training and test sets

In [79]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print("Number of rows in train data =", x_train.shape[0])
print("Number of rows in test data =", x_test.shape[0])
Number of rows in train data = 2417
Number of rows in test data = 1037

Model Building - Linear Regression¶

Creating the first model

In [80]:
olsmodel1 = sm.OLS(y_train, x_train).fit()  # fitting the OLS model on the training data
print(olsmodel1.summary())
                              OLS Regression Results                             
=================================================================================
Dep. Variable:     normalized_used_price   R-squared:                       0.845
Model:                               OLS   Adj. R-squared:                  0.842
Method:                    Least Squares   F-statistic:                     268.7
Date:                   Fri, 11 Oct 2024   Prob (F-statistic):               0.00
Time:                           20:29:34   Log-Likelihood:                 123.85
No. Observations:                   2417   AIC:                            -149.7
Df Residuals:                       2368   BIC:                             134.0
Df Model:                             48                                         
Covariance Type:               nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     1.3156      0.071     18.454      0.000       1.176       1.455
screen_size               0.0244      0.003      7.163      0.000       0.018       0.031
main_camera_mp            0.0208      0.002     13.848      0.000       0.018       0.024
selfie_camera_mp          0.0135      0.001     11.997      0.000       0.011       0.016
int_memory                0.0001   6.97e-05      1.651      0.099   -2.16e-05       0.000
ram                       0.0230      0.005      4.451      0.000       0.013       0.033
battery               -1.689e-05   7.27e-06     -2.321      0.020   -3.12e-05   -2.62e-06
weight                    0.0010      0.000      7.480      0.000       0.001       0.001
days_used              4.216e-05   3.09e-05      1.366      0.172   -1.84e-05       0.000
normalized_new_price      0.4311      0.012     35.147      0.000       0.407       0.455
years_since_release      -0.0237      0.005     -5.193      0.000      -0.033      -0.015
brand_name_Alcatel        0.0154      0.048      0.323      0.747      -0.078       0.109
brand_name_Apple         -0.0038      0.147     -0.026      0.980      -0.292       0.285
brand_name_Asus           0.0151      0.048      0.314      0.753      -0.079       0.109
brand_name_BlackBerry    -0.0300      0.070     -0.427      0.669      -0.168       0.108
brand_name_Celkon        -0.0468      0.066     -0.707      0.480      -0.177       0.083
brand_name_Coolpad        0.0209      0.073      0.287      0.774      -0.122       0.164
brand_name_Gionee         0.0448      0.058      0.775      0.438      -0.068       0.158
brand_name_Google        -0.0326      0.085     -0.385      0.700      -0.199       0.133
brand_name_HTC           -0.0130      0.048     -0.270      0.787      -0.108       0.081
brand_name_Honor          0.0317      0.049      0.644      0.520      -0.065       0.128
brand_name_Huawei        -0.0020      0.044     -0.046      0.964      -0.089       0.085
brand_name_Infinix        0.1633      0.093      1.752      0.080      -0.019       0.346
brand_name_Karbonn        0.0943      0.067      1.405      0.160      -0.037       0.226
brand_name_LG            -0.0132      0.045     -0.291      0.771      -0.102       0.076
brand_name_Lava           0.0332      0.062      0.533      0.594      -0.089       0.155
brand_name_Lenovo         0.0454      0.045      1.004      0.316      -0.043       0.134
brand_name_Meizu         -0.0129      0.056     -0.230      0.818      -0.123       0.097
brand_name_Micromax      -0.0337      0.048     -0.704      0.481      -0.128       0.060
brand_name_Microsoft      0.0952      0.088      1.078      0.281      -0.078       0.268
brand_name_Motorola      -0.0112      0.050     -0.226      0.821      -0.109       0.086
brand_name_Nokia          0.0719      0.052      1.387      0.166      -0.030       0.174
brand_name_OnePlus        0.0709      0.077      0.916      0.360      -0.081       0.223
brand_name_Oppo           0.0124      0.048      0.261      0.794      -0.081       0.106
brand_name_Others        -0.0080      0.042     -0.190      0.849      -0.091       0.075
brand_name_Panasonic      0.0563      0.056      1.008      0.314      -0.053       0.166
brand_name_Realme         0.0319      0.062      0.518      0.605      -0.089       0.153
brand_name_Samsung       -0.0313      0.043     -0.725      0.469      -0.116       0.053
brand_name_Sony          -0.0616      0.050     -1.220      0.223      -0.161       0.037
brand_name_Spice         -0.0147      0.063     -0.233      0.816      -0.139       0.109
brand_name_Vivo          -0.0154      0.048     -0.318      0.750      -0.110       0.080
brand_name_XOLO           0.0152      0.055      0.277      0.782      -0.092       0.123
brand_name_Xiaomi         0.0869      0.048      1.806      0.071      -0.007       0.181
brand_name_ZTE           -0.0057      0.047     -0.121      0.904      -0.099       0.087
os_Others                -0.0510      0.033     -1.555      0.120      -0.115       0.013
os_Windows               -0.0207      0.045     -0.459      0.646      -0.109       0.068
os_iOS                   -0.0663      0.146     -0.453      0.651      -0.354       0.221
4g_yes                    0.0528      0.016      3.326      0.001       0.022       0.084
5g_yes                   -0.0714      0.031     -2.268      0.023      -0.133      -0.010
==============================================================================
Omnibus:                      223.612   Durbin-Watson:                   1.910
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              422.275
Skew:                          -0.620   Prob(JB):                     2.01e-92
Kurtosis:                       4.630   Cond. No.                     1.78e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.78e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

Observations

  • Adjusted R-squared: the model explains 84.2% of the variance in the training set.
  • const coefficient: this is the Y-intercept. If all predictor variables are zero, the expected normalized used price is 1.3156.
  • Coefficient of a predictor variable: it represents the change in the output Y for a one-unit change in that predictor, with everything else held constant. For example, a one-unit increase in normalized_new_price is associated with a 0.4311 increase in the normalized used price.
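
To make the interpretation concrete, the fitted equation implied by the coefficients above can be written (showing only a few of the terms) as:

$\widehat{\text{normalized\_used\_price}} = 1.3156 + 0.4311 \times \text{normalized\_new\_price} + 0.0244 \times \text{screen\_size} + 0.0230 \times \text{ram} - 0.0237 \times \text{years\_since\_release} + \dots$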

Model Performance Check¶

Checks to be done:

  • We will be using metric functions defined in sklearn for RMSE, MAE, and $R^2$.
  • We will define functions to calculate MAPE and adjusted $R^2$.
  • A helper function will collect all of these metrics in a dataframe so the results can be compared easily.
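
For reference, the two metrics that are not taken directly from sklearn are computed as

$R^2_{adj} = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}$

where $n$ is the number of observations and $k$ is the number of predictors, and

$\text{MAPE} = \frac{100}{n}\sum_{i=1}^{n}\left|\frac{y_i - \hat{y}_i}{y_i}\right|$

which is what the functions defined below implement.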

Function to work with adjusted R-squared

In [81]:
def adj_r2_score(predictors, targets, predictions):
    r2 = r2_score(targets, predictions)
    n = predictors.shape[0]
    k = predictors.shape[1]
    return 1 - ((1 - r2) * (n - 1) / (n - k - 1))


# function to compute MAPE
def mape_score(targets, predictions):
    return np.mean(np.abs(targets - predictions) / targets) * 100


# function to compute different metrics to check performance of a regression model
def model_performance_regression(model, predictors, target):
    """
    Function to compute different metrics to check regression model performance

    model: regressor
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    r2 = r2_score(target, pred)  # to compute R-squared
    adjr2 = adj_r2_score(predictors, target, pred)  # to compute adjusted R-squared
    rmse = np.sqrt(mean_squared_error(target, pred))  # to compute RMSE
    mae = mean_absolute_error(target, pred)  # to compute MAE
    mape = mape_score(target, pred)  # to compute MAPE

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "RMSE": rmse,
            "MAE": mae,
            "R-squared": r2,
            "Adj. R-squared": adjr2,
            "MAPE": mape,
        },
        index=[0],
    )

    return df_perf
In [82]:
# checking model performance on train set (seen 70% data)
print("Training Performance\n")
olsmodel1_train_perf = model_performance_regression(olsmodel1, x_train, y_train)
olsmodel1_train_perf
Training Performance

Out[82]:
RMSE MAE R-squared Adj. R-squared MAPE
0 0.229884 0.180326 0.844886 0.841675 4.326841

Observations

  • The training $R^2$ is about 0.845, so the model is not underfitting.
  • MAE suggests that the model can predict the normalized used price within a mean error of about 0.18 on the training data.
  • MAPE of about 4.3 on the training data means the predictions are, on average, within roughly 4.3% of the normalized used price.
  • Performance on the unseen test set still needs to be checked to rule out overfitting.
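
A corresponding check on the unseen 30% can be run with the same helper to compare train and test performance (a sketch; output not shown here):

In [ ]:
# checking model performance on the test set (unseen 30% data)
olsmodel1_test_perf = model_performance_regression(olsmodel1, x_test, y_test)
olsmodel1_test_perf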

Checking Linear Regression Assumptions¶

In order to make statistical inferences from a linear regression model, it is important to ensure that the assumptions of linear regression are satisfied.

  1. No Multicollinearity

  2. Linearity of variables and independence

  3. Normality of error terms

  4. No Heteroscedasticity

TEST FOR MULTICOLLINEARITY¶

  • We will test for multicollinearity using VIF.

  • General Rule of thumb:

    • If VIF is 1 then there is no correlation between the $k$th predictor and the remaining predictor variables.
    • If VIF exceeds 5 or is close to exceeding 5, we say there is moderate multicollinearity.
    • If VIF is 10 or exceeding 10, it shows signs of high multicollinearity.
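
For context, the VIF of the $k$th predictor is

$\text{VIF}_k = \frac{1}{1 - R_k^2}$

where $R_k^2$ is the $R^2$ obtained by regressing the $k$th predictor on all the other predictors, so a stronger linear dependence on the other predictors gives a higher VIF.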
In [85]:
#FUNCTION to check VIF
def checking_vif(predictors):
    vif = pd.DataFrame()
    vif["feature"] = predictors.columns
    # calculating VIF for each feature
    vif["VIF"] = [
        variance_inflation_factor(predictors.values, i)
        for i in range(len(predictors.columns))
    ]
    return vif
In [86]:
# Initial check of data used for training
checking_vif(x_train)
Out[86]:
feature VIF
0 const 227.744081
1 screen_size 7.677290
2 main_camera_mp 2.285051
3 selfie_camera_mp 2.812473
4 int_memory 1.364152
5 ram 2.282352
6 battery 4.081780
7 weight 6.396749
8 days_used 2.660269
9 normalized_new_price 3.119430
10 years_since_release 4.899007
11 brand_name_Alcatel 3.405693
12 brand_name_Apple 13.057668
13 brand_name_Asus 3.332038
14 brand_name_BlackBerry 1.632378
15 brand_name_Celkon 1.774721
16 brand_name_Coolpad 1.468006
17 brand_name_Gionee 1.951272
18 brand_name_Google 1.321778
19 brand_name_HTC 3.410361
20 brand_name_Honor 3.340687
21 brand_name_Huawei 5.983852
22 brand_name_Infinix 1.283955
23 brand_name_Karbonn 1.573702
24 brand_name_LG 4.849832
25 brand_name_Lava 1.711360
26 brand_name_Lenovo 4.558941
27 brand_name_Meizu 2.179607
28 brand_name_Micromax 3.363521
29 brand_name_Microsoft 1.869751
30 brand_name_Motorola 3.274558
31 brand_name_Nokia 3.479849
32 brand_name_OnePlus 1.437034
33 brand_name_Oppo 3.971194
34 brand_name_Others 9.711034
35 brand_name_Panasonic 2.105703
36 brand_name_Realme 1.946812
37 brand_name_Samsung 7.539866
38 brand_name_Sony 2.943161
39 brand_name_Spice 1.688863
40 brand_name_Vivo 3.651437
41 brand_name_XOLO 2.138070
42 brand_name_Xiaomi 3.719689
43 brand_name_ZTE 3.797581
44 os_Others 1.859863
45 os_Windows 1.596034
46 os_iOS 11.784684
47 4g_yes 2.467681
48 5g_yes 1.813900

Observations: The following columns have VIF values greater than (or close to) 5, indicating multicollinearity:

  • screen_size
  • weight
  • years_since_release
  • brand_name_Apple
  • brand_name_Huawei
  • brand_name_LG
  • brand_name_Others
  • brand_name_Samsung
  • os_iOS
Removing Multicollinearity¶

To remove multicollinearity, we follow these steps:

  1. Drop every column one by one that has a VIF score close to or greater than 5.
  2. Look at the adjusted R-squared and RMSE of all these models.
  3. Drop the variable that makes the least change in adjusted R-squared.
  4. Check the VIF scores again.
  5. Continue till you get all VIF scores under 5.

A function is created to keep checking multicollinearity as we drop columns and re-check the VIF.

In [87]:
def treating_multicollinearity(predictors, target, high_vif_columns):
    """
    Checking the effect of dropping the columns showing high multicollinearity
    on model performance (adj. R-squared and RMSE)

    predictors: independent variables
    target: dependent variable
    high_vif_columns: columns having high VIF
    """
    # empty lists to store adj. R-squared and RMSE values
    adj_r2 = []
    rmse = []

    # build ols models by dropping one of the high VIF columns at a time
    # store the adjusted R-squared and RMSE in the lists defined previously
    for cols in high_vif_columns:
        # defining the new train set
        train = predictors.loc[:, ~predictors.columns.str.startswith(cols)]

        # create the model
        olsmodel = sm.OLS(target, train).fit()

        # adding adj. R-squared and RMSE to the lists
        adj_r2.append(olsmodel.rsquared_adj)
        rmse.append(np.sqrt(olsmodel.mse_resid))

    # creating a dataframe for the results
    temp = pd.DataFrame(
        {
            "col": high_vif_columns,
            "Adj. R-squared after_dropping col": adj_r2,
            "RMSE after dropping col": rmse,
        }
    ).sort_values(by="Adj. R-squared after_dropping col", ascending=False)
    temp.reset_index(drop=True, inplace=True)

    return temp
In [88]:
# Initial set of columns found in the first check for multicollinearity:
col_list = ['screen_size',
            'weight',
            'years_since_release',
            'brand_name_Apple',
            'brand_name_Huawei',
            'brand_name_LG',
            'brand_name_Others',
            'brand_name_Samsung',
            'os_iOS']

#Call to evaluate MC for each of these columns
res = treating_multicollinearity(x_train, y_train, col_list)
res
Out[88]:
col Adj. R-squared after_dropping col RMSE after dropping col
0 brand_name_Apple 0.841809 0.232201
1 brand_name_Huawei 0.841808 0.232201
2 brand_name_Others 0.841806 0.232203
3 brand_name_LG 0.841803 0.232205
4 os_iOS 0.841795 0.232211
5 brand_name_Samsung 0.841774 0.232227
6 years_since_release 0.840007 0.233519
7 screen_size 0.838381 0.234703
8 weight 0.838071 0.234928
Dropping brand_name_Apple and re-evaluating¶
In [89]:
col_to_drop = 'brand_name_Apple'
#After each elimination a new DS is created to avoid overwriting.
x_train2 = x_train.loc[:, ~x_train.columns.str.startswith(col_to_drop)]
x_test2 = x_test.loc[:, ~x_test.columns.str.startswith(col_to_drop)]

# Check VIF now
vif = checking_vif(x_train2)
print("VIF after dropping ", col_to_drop)
vif
VIF after dropping  brand_name_Apple
Out[89]:
feature VIF
0 const 227.303871
1 screen_size 7.643583
2 main_camera_mp 2.284429
3 selfie_camera_mp 2.791121
4 int_memory 1.364138
5 ram 2.273686
6 battery 4.079720
7 weight 6.394014
8 days_used 2.659799
9 normalized_new_price 3.102417
10 years_since_release 4.890322
11 brand_name_Alcatel 3.230773
12 brand_name_Asus 3.145077
13 brand_name_BlackBerry 1.561121
14 brand_name_Celkon 1.731854
15 brand_name_Coolpad 1.436852
16 brand_name_Gionee 1.886381
17 brand_name_Google 1.293163
18 brand_name_HTC 3.240658
19 brand_name_Honor 3.160024
20 brand_name_Huawei 5.581714
21 brand_name_Infinix 1.265587
22 brand_name_Karbonn 1.544264
23 brand_name_LG 4.565534
24 brand_name_Lava 1.670810
25 brand_name_Lenovo 4.291839
26 brand_name_Meizu 2.092837
27 brand_name_Micromax 3.214171
28 brand_name_Microsoft 1.835669
29 brand_name_Motorola 3.109867
30 brand_name_Nokia 3.266268
31 brand_name_OnePlus 1.402445
32 brand_name_Oppo 3.762937
33 brand_name_Others 9.075554
34 brand_name_Panasonic 2.030895
35 brand_name_Realme 1.878915
36 brand_name_Samsung 6.991538
37 brand_name_Sony 2.799174
38 brand_name_Spice 1.655389
39 brand_name_Vivo 3.447853
40 brand_name_XOLO 2.069729
41 brand_name_Xiaomi 3.513125
42 brand_name_ZTE 3.604439
43 os_Others 1.739572
44 os_Windows 1.593814
45 os_iOS 1.908608
46 4g_yes 2.466689
47 5g_yes 1.805259
In [90]:
# Re-evaluating the remaining high-VIF columns after dropping brand_name_Apple:
col_list = ['screen_size',
            'weight',
            'years_since_release',
            'brand_name_Huawei',
            'brand_name_LG',
            'brand_name_Others',
            'brand_name_Samsung']
#Call to evaluate MC for each of these columns
res = treating_multicollinearity(x_train, y_train, col_list)
res
Out[90]:
col Adj. R-squared after_dropping col RMSE after dropping col
0 brand_name_Huawei 0.841808 0.232201
1 brand_name_Others 0.841806 0.232203
2 brand_name_LG 0.841803 0.232205
3 brand_name_Samsung 0.841774 0.232227
4 years_since_release 0.840007 0.233519
5 screen_size 0.838381 0.234703
6 weight 0.838071 0.234928
Dropping brand_name_Huawei and re-evaluating¶
In [91]:
#After each elimination a new DS is created to avoid overwriting.
col_to_drop = 'brand_name_Huawei'
x_train3 = x_train2.loc[:, ~x_train2.columns.str.startswith(col_to_drop)]
x_test3 = x_test2.loc[:, ~x_test2.columns.str.startswith(col_to_drop)]

# Check VIF now
vif = checking_vif(x_train3)
print("VIF after dropping ", col_to_drop)
vif
VIF after dropping  brand_name_Huawei
Out[91]:
feature VIF
0 const 168.020551
1 screen_size 7.631172
2 main_camera_mp 2.284069
3 selfie_camera_mp 2.780909
4 int_memory 1.362495
5 ram 2.272754
6 battery 4.079196
7 weight 6.393234
8 days_used 2.659317
9 normalized_new_price 3.102129
10 years_since_release 4.890240
11 brand_name_Alcatel 1.434036
12 brand_name_Asus 1.367355
13 brand_name_BlackBerry 1.158290
14 brand_name_Celkon 1.257428
15 brand_name_Coolpad 1.080731
16 brand_name_Gionee 1.169453
17 brand_name_Google 1.059218
18 brand_name_HTC 1.400717
19 brand_name_Honor 1.356413
20 brand_name_Infinix 1.070724
21 brand_name_Karbonn 1.137121
22 brand_name_LG 1.611822
23 brand_name_Lava 1.145103
24 brand_name_Lenovo 1.563553
25 brand_name_Meizu 1.184730
26 brand_name_Micromax 1.480644
27 brand_name_Microsoft 1.540532
28 brand_name_Motorola 1.381692
29 brand_name_Nokia 1.713629
30 brand_name_OnePlus 1.084228
31 brand_name_Oppo 1.461835
32 brand_name_Others 2.440105
33 brand_name_Panasonic 1.188348
34 brand_name_Realme 1.196286
35 brand_name_Samsung 2.000047
36 brand_name_Sony 1.345151
37 brand_name_Spice 1.163407
38 brand_name_Vivo 1.399676
39 brand_name_XOLO 1.240841
40 brand_name_Xiaomi 1.412073
41 brand_name_ZTE 1.453068
42 os_Others 1.732993
43 os_Windows 1.593605
44 os_iOS 1.224011
45 4g_yes 2.447008
46 5g_yes 1.803835
In [92]:
# Re-evaluating the remaining high-VIF columns after dropping brand_name_Huawei:
col_list = ['screen_size',
            'weight',
            'years_since_release'
            ]
res = treating_multicollinearity(x_train, y_train, col_list)
res
Out[92]:
col Adj. R-squared after_dropping col RMSE after dropping col
0 years_since_release 0.840007 0.233519
1 screen_size 0.838381 0.234703
2 weight 0.838071 0.234928
Dropping years_since_release and re-evaluating¶
In [93]:
#After each elimination a new DS is created to avoid overwriting.
col_to_drop = 'years_since_release'
x_train4 = x_train3.loc[:, ~x_train3.columns.str.startswith(col_to_drop)]
x_test4 = x_test3.loc[:, ~x_test3.columns.str.startswith(col_to_drop)]

# Check VIF now
vif = checking_vif(x_train4)
print("VIF after dropping ", col_to_drop)
vif
VIF after dropping  years_since_release
Out[93]:
feature VIF
0 const 162.699578
1 screen_size 7.358795
2 main_camera_mp 2.272393
3 selfie_camera_mp 2.483454
4 int_memory 1.346347
5 ram 2.271189
6 battery 3.956113
7 weight 6.165888
8 days_used 1.907542
9 normalized_new_price 2.899427
10 brand_name_Alcatel 1.433988
11 brand_name_Asus 1.367082
12 brand_name_BlackBerry 1.156426
13 brand_name_Celkon 1.247699
14 brand_name_Coolpad 1.080418
15 brand_name_Gionee 1.169102
16 brand_name_Google 1.054586
17 brand_name_HTC 1.399615
18 brand_name_Honor 1.356333
19 brand_name_Infinix 1.070382
20 brand_name_Karbonn 1.131710
21 brand_name_LG 1.611809
22 brand_name_Lava 1.144904
23 brand_name_Lenovo 1.561471
24 brand_name_Meizu 1.183916
25 brand_name_Micromax 1.480129
26 brand_name_Microsoft 1.536148
27 brand_name_Motorola 1.381296
28 brand_name_Nokia 1.691546
29 brand_name_OnePlus 1.084209
30 brand_name_Oppo 1.460313
31 brand_name_Others 2.439684
32 brand_name_Panasonic 1.188125
33 brand_name_Realme 1.193376
34 brand_name_Samsung 1.999382
35 brand_name_Sony 1.343304
36 brand_name_Spice 1.160672
37 brand_name_Vivo 1.399561
38 brand_name_XOLO 1.239093
39 brand_name_Xiaomi 1.411976
40 brand_name_ZTE 1.452299
41 os_Others 1.732555
42 os_Windows 1.586400
43 os_iOS 1.219086
44 4g_yes 2.102178
45 5g_yes 1.782562
In [94]:
# Re-evaluating the remaining high-VIF columns after dropping years_since_release:
col_list = ['screen_size',
            'weight'
            ]

res = treating_multicollinearity(x_train, y_train, col_list)
res
Out[94]:
col Adj. R-squared after_dropping col RMSE after dropping col
0 screen_size 0.838381 0.234703
1 weight 0.838071 0.234928
Dropping screen_size and re-evaluating¶
In [95]:
#After each elimination a new DS is created to avoid overwriting.
col_to_drop = 'screen_size'
x_train5 = x_train4.loc[:, ~x_train4.columns.str.startswith(col_to_drop)]
x_test5 = x_test4.loc[:, ~x_test4.columns.str.startswith(col_to_drop)]

# Check VIF now
vif = checking_vif(x_train5)
print("VIF after dropping ", col_to_drop)
vif
VIF after dropping  screen_size
Out[95]:
feature VIF
0 const 143.347256
1 main_camera_mp 2.266054
2 selfie_camera_mp 2.460653
3 int_memory 1.345777
4 ram 2.271186
5 battery 3.632634
6 weight 2.969761
7 days_used 1.831285
8 normalized_new_price 2.882086
9 brand_name_Alcatel 1.432521
10 brand_name_Asus 1.364344
11 brand_name_BlackBerry 1.155008
12 brand_name_Celkon 1.247343
13 brand_name_Coolpad 1.080416
14 brand_name_Gionee 1.161469
15 brand_name_Google 1.053747
16 brand_name_HTC 1.392787
17 brand_name_Honor 1.354149
18 brand_name_Infinix 1.070307
19 brand_name_Karbonn 1.131481
20 brand_name_LG 1.601708
21 brand_name_Lava 1.144893
22 brand_name_Lenovo 1.559047
23 brand_name_Meizu 1.182831
24 brand_name_Micromax 1.477835
25 brand_name_Microsoft 1.535855
26 brand_name_Motorola 1.374494
27 brand_name_Nokia 1.685545
28 brand_name_OnePlus 1.084175
29 brand_name_Oppo 1.457448
30 brand_name_Others 2.403133
31 brand_name_Panasonic 1.188066
32 brand_name_Realme 1.192537
33 brand_name_Samsung 1.990488
34 brand_name_Sony 1.339446
35 brand_name_Spice 1.155799
36 brand_name_Vivo 1.399552
37 brand_name_XOLO 1.239079
38 brand_name_Xiaomi 1.409451
39 brand_name_ZTE 1.447452
40 os_Others 1.517309
41 os_Windows 1.586387
42 os_iOS 1.213910
43 4g_yes 2.095604
44 5g_yes 1.780867

Output: All remaining predictors now have VIF values below 5, so there is little to no multicollinearity and the assumption is satisfied.

Removing columns with high p-values¶

The process goes as follows:

  • We will drop the predictor variables having a p-value greater than 0.05 as they do not significantly impact the target variable.
  • Steps:
    • Build a model, check the p-values of the variables, and drop the column with the highest p-value.
    • Create a new model without the dropped feature, check the p-values of the variables, and drop the column with the highest p-value.
    • Repeat the above two steps till there are no columns with p-value > 0.05.
In [96]:
predictors = x_train5.copy()
cols = predictors.columns.tolist()
# setting an initial max p-value
max_p_value = 1
while len(cols) > 0:
    # defining the train set
    x_train_aux = predictors[cols]

    # fitting the model
    model = sm.OLS(y_train, x_train_aux).fit()

    # getting the p-values and the maximum p-value
    p_values = model.pvalues
    max_p_value = max(p_values)

    # name of the variable with maximum p-value
    feature_with_p_max = p_values.idxmax()

    if max_p_value > 0.05:
        cols.remove(feature_with_p_max)
    else:
        break

selected_features = cols
print(selected_features)
['const', 'main_camera_mp', 'selfie_camera_mp', 'ram', 'weight', 'days_used', 'normalized_new_price', 'brand_name_Honor', 'brand_name_Infinix', 'brand_name_Lenovo', 'brand_name_Nokia', 'brand_name_Sony', 'brand_name_Xiaomi', 'os_Others', '4g_yes']

After obtaining the predictors with p-values below 0.05, we can now train and evaluate a new model.

In [97]:
x_train6 = x_train5[selected_features]
x_test6 = x_test5[selected_features]

Generating new model

In [98]:
olsmodel2 = sm.OLS(y_train, x_train6).fit()
print(olsmodel2.summary())
                              OLS Regression Results                             
=================================================================================
Dep. Variable:     normalized_used_price   R-squared:                       0.836
Model:                               OLS   Adj. R-squared:                  0.835
Method:                    Least Squares   F-statistic:                     875.6
Date:                   Fri, 11 Oct 2024   Prob (F-statistic):               0.00
Time:                           20:30:29   Log-Likelihood:                 57.707
No. Observations:                   2417   AIC:                            -85.41
Df Residuals:                       2402   BIC:                             1.440
Df Model:                             14                                         
Covariance Type:               nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    1.4591      0.048     30.413      0.000       1.365       1.553
main_camera_mp           0.0228      0.001     16.045      0.000       0.020       0.026
selfie_camera_mp         0.0170      0.001     16.998      0.000       0.015       0.019
ram                      0.0186      0.005      4.109      0.000       0.010       0.027
weight                   0.0017   6.02e-05     28.436      0.000       0.002       0.002
days_used            -9.318e-05   2.42e-05     -3.849      0.000      -0.000   -4.57e-05
normalized_new_price     0.4180      0.011     39.050      0.000       0.397       0.439
brand_name_Honor         0.0575      0.028      2.052      0.040       0.003       0.112
brand_name_Infinix       0.1830      0.085      2.155      0.031       0.016       0.350
brand_name_Lenovo        0.0503      0.022      2.299      0.022       0.007       0.093
brand_name_Nokia         0.0807      0.031      2.608      0.009       0.020       0.141
brand_name_Sony         -0.0682      0.031     -2.222      0.026      -0.128      -0.008
brand_name_Xiaomi        0.0937      0.026      3.602      0.000       0.043       0.145
os_Others               -0.1577      0.028     -5.548      0.000      -0.213      -0.102
4g_yes                   0.0898      0.014      6.580      0.000       0.063       0.117
==============================================================================
Omnibus:                      226.072   Durbin-Watson:                   1.915
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              416.351
Skew:                          -0.633   Prob(JB):                     3.89e-91
Kurtosis:                       4.590   Cond. No.                     1.31e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [99]:
# checking model performance on train set (seen 70% data)
print("Training Performance\n")
olsmodel2_train_perf = model_performance_regression(olsmodel2, x_train6, y_train)
olsmodel2_train_perf
Training Performance

Out[99]:
RMSE MAE R-squared Adj. R-squared MAPE
0 0.236262 0.184698 0.836159 0.835136 4.440728
In [100]:
# checking model performance on test set (seen 30% data)
print("Test Performance\n")
olsmodel2_test_perf = model_performance_regression(olsmodel2, x_test6, y_test)
olsmodel2_test_perf
Test Performance

Out[100]:
RMSE MAE R-squared Adj. R-squared MAPE
0 0.243751 0.188672 0.835271 0.832851 4.606086

Observations

  • No feature has a p-value greater than 0.05, which indicates that the features in x_train6 are the predictors required for the model.
  • olsmodel2 can therefore be considered the right model to use.
  • The adjusted R-squared is 0.835, which indicates that the model is able to explain 83.5% of the variance.
  • The initial model's R-squared (0.845) and the new model's (0.835) are very close, which indicates that the dropped variables do not affect the model's explanatory power much.
  • RMSE and MAE are comparable on the train and test data, which indicates that the model is not overfitting (see the side-by-side view below).
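
A small optional sketch (assuming the olsmodel2 performance dataframes created in the cells above) that places the train and test metrics side by side for easier comparison:

In [ ]:
# Optional: view train vs. test metrics side by side (sketch; assumes the
# olsmodel2_train_perf and olsmodel2_test_perf dataframes created above).
comparison = pd.concat([olsmodel2_train_perf, olsmodel2_test_perf], axis=0)
comparison.index = ["Train", "Test"]
print(comparison)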

TEST FOR LINEARITY AND INDEPENDENCE¶

This test follows the rule learned in class:

  • If the fitted vs. residual plot does not show any pattern, we can say that the relationship is linear and the residuals are independent.
In [101]:
# let us create a dataframe with actual, fitted and residual values
df_pred = pd.DataFrame()

df_pred["Actual Values"] = y_train  # actual values
df_pred["Fitted Values"] = olsmodel2.fittedvalues  # predicted values
df_pred["Residuals"] = olsmodel2.resid  # residuals

df_pred.head()
Out[101]:
Actual Values Fitted Values Residuals
3026 4.087488 3.855511 0.231976
1525 4.448399 4.658170 -0.209771
1128 4.315353 4.330420 -0.015068
3003 4.282068 4.201580 0.080488
2907 4.456438 4.510997 -0.054559
In [102]:
sns.residplot(
    data=df_pred, x="Fitted Values", y="Residuals", color="violet", lowess=True
)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Fitted vs Residual plot")
plt.show()

Observations

  • The scatter plot shows the residuals against the fitted values.
  • The plot doesn't follow any pattern, so the assumptions of linearity and independence are satisfied.

NORMALITY OF ERROR¶

  • The shape of the histogram of residuals can give an initial idea about the normality.
  • It can also be checked via a Q-Q plot of residuals. If the residuals follow a normal distribution, they will make a straight line plot, otherwise not.
  • Other tests to check for normality include the Shapiro-Wilk test.
    • Null hypothesis: Residuals are normally distributed
    • Alternate hypothesis: Residuals are not normally distributed
In [103]:
sns.histplot(data=df_pred, x="Residuals", kde=True, color="green")  # distribution of residuals
plt.title("Normality of residuals")
plt.show()
In [104]:
import pylab
import scipy.stats as stats

stats.probplot(df_pred["Residuals"], dist="norm", plot=pylab)  # Q-Q plot of residuals against a normal distribution
plt.show()
In [105]:
stats.shapiro(df_pred['Residuals'])
Out[105]:
ShapiroResult(statistic=0.9704340065495929, pvalue=6.404618530675067e-22)

Observations

  • Since the p-value < 0.05, the Shapiro-Wilk test rejects normality; however, with large samples the test flags even small deviations, and the residuals are close enough to normal for practical purposes (see the optional check below).
  • The Q-Q plot shows the residuals lying roughly along a straight line, which suggests approximate normality.
  • The assumption is reasonably satisfied.
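
As an optional numeric check (not part of the original workflow), the skewness and excess kurtosis of the residuals can be inspected; values close to 0 support approximate normality:

In [ ]:
# Optional numeric check: skewness and excess kurtosis of the residuals
# (both equal 0 for a perfectly normal distribution).
print("Skewness:", stats.skew(df_pred["Residuals"]))
print("Excess kurtosis:", stats.kurtosis(df_pred["Residuals"]))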

TEST FOR HETEROSCEDASTICITY¶

  • Homoscedasticity: If the variance of the residuals is symmetrically distributed across the regression line, then the data is said to be homoscedastic.

  • Heteroscedasticity: If the variance of the residuals is unequal across the regression line, then the data is said to be heteroscedastic.

  • The residual vs fitted values plot can be inspected to check for homoscedasticity. In the case of heteroscedasticity, the residuals can form a funnel shape or another non-symmetrical shape.

  • The Goldfeld-Quandt test can also be used. If we get a p-value > 0.05, we can say that the residuals are homoscedastic; otherwise, they are heteroscedastic.
    • Null hypothesis: Residuals are homoscedastic
    • Alternate hypothesis: Residuals have heteroscedasticity
In [106]:
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

name = ["F statistic", "p-value"]
test = sms.het_goldfeldquandt(df_pred["Residuals"], x_train6)  # Goldfeld-Quandt test on the residuals
lzip(name, test)
Out[106]:
[('F statistic', 1.0162458020411484), ('p-value', 0.39038493075871056)]

Observations

  • Since the p-value > 0.05, we can say that the residuals are homoscedastic. The assumption is satisfied (an optional cross-check follows below).
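
As an optional cross-check (not part of the original workflow), the Breusch-Pagan test from statsmodels can be applied to the same residuals and predictors; a p-value above 0.05 would again support homoscedasticity:

In [ ]:
# Optional cross-check with the Breusch-Pagan test (sketch; uses the same
# residuals and predictors as the Goldfeld-Quandt test above).
bp_names = ["LM statistic", "LM p-value", "F statistic", "F p-value"]
bp_test = sms.het_breuschpagan(df_pred["Residuals"], x_train6)
lzip(bp_names, bp_test)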

Final Model¶

After adapting the model to satisfy the regression assumptions and ensuring that we use only the key factors affecting the used price, we can build our final model.

In [107]:
olsmodel_final = sm.OLS(y_train, x_train6).fit()
print(olsmodel_final.summary())
                              OLS Regression Results                             
=================================================================================
Dep. Variable:     normalized_used_price   R-squared:                       0.836
Model:                               OLS   Adj. R-squared:                  0.835
Method:                    Least Squares   F-statistic:                     875.6
Date:                   Fri, 11 Oct 2024   Prob (F-statistic):               0.00
Time:                           20:34:29   Log-Likelihood:                 57.707
No. Observations:                   2417   AIC:                            -85.41
Df Residuals:                       2402   BIC:                             1.440
Df Model:                             14                                         
Covariance Type:               nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    1.4591      0.048     30.413      0.000       1.365       1.553
main_camera_mp           0.0228      0.001     16.045      0.000       0.020       0.026
selfie_camera_mp         0.0170      0.001     16.998      0.000       0.015       0.019
ram                      0.0186      0.005      4.109      0.000       0.010       0.027
weight                   0.0017   6.02e-05     28.436      0.000       0.002       0.002
days_used            -9.318e-05   2.42e-05     -3.849      0.000      -0.000   -4.57e-05
normalized_new_price     0.4180      0.011     39.050      0.000       0.397       0.439
brand_name_Honor         0.0575      0.028      2.052      0.040       0.003       0.112
brand_name_Infinix       0.1830      0.085      2.155      0.031       0.016       0.350
brand_name_Lenovo        0.0503      0.022      2.299      0.022       0.007       0.093
brand_name_Nokia         0.0807      0.031      2.608      0.009       0.020       0.141
brand_name_Sony         -0.0682      0.031     -2.222      0.026      -0.128      -0.008
brand_name_Xiaomi        0.0937      0.026      3.602      0.000       0.043       0.145
os_Others               -0.1577      0.028     -5.548      0.000      -0.213      -0.102
4g_yes                   0.0898      0.014      6.580      0.000       0.063       0.117
==============================================================================
Omnibus:                      226.072   Durbin-Watson:                   1.915
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              416.351
Skew:                          -0.633   Prob(JB):                     3.89e-91
Kurtosis:                       4.590   Cond. No.                     1.31e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.31e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [108]:
# checking model performance on train set (seen 70% data)
print("Training Performance\n")
olsmodel_final_train_perf = model_performance_regression(olsmodel_final, x_train6, y_train)
olsmodel_final_train_perf
Training Performance

Out[108]:
RMSE MAE R-squared Adj. R-squared MAPE
0 0.236262 0.184698 0.836159 0.835136 4.440728
In [109]:
# checking model performance on test set (seen 30% data)
print("Test Performance\n")
olsmodel_final_test_perf = model_performance_regression(olsmodel_final, x_test6, y_test)
olsmodel_final_test_perf
Test Performance

Out[109]:
RMSE MAE R-squared Adj. R-squared MAPE
0 0.243751 0.188672 0.835271 0.832851 4.606086

Observations

  • The model is able to explain 83.5% of the variation in the data.
  • The train and test RMSE and MAE are comparable, which confirms that our model is not suffering from overfitting.
  • The MAPE on the test set suggests that, on average, the predictions are within about 4.6% of the actual normalized used-device price (see the sketch after this list).
  • We can say that olsmodel_final is a good model for both prediction and inference purposes.
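
For clarity, a minimal sketch of how the reported MAPE relates to the model's predictions, assuming the model_performance_regression helper uses the standard definition MAPE = mean(|y - ŷ| / |y|) × 100:

In [ ]:
# Sketch: computing MAPE on the test set directly from the final model's
# predictions (assumes the standard MAPE formula is used by the helper).
y_pred_test = olsmodel_final.predict(x_test6)
mape_test = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
print(f"Test MAPE: {mape_test:.2f}%")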

Actionable Insights and Recommendations¶

  • The prices of used and new devices are correlated, which indicates that if the price of a new device is high, its price as a used device will also be high.
  • The main factors affecting the price of used devices are: brand, rear and front camera resolution, RAM, weight, the original (new) price, and 4G availability (see the coefficient sketch after this list).
    • For each additional megapixel of rear camera resolution, the normalized used price increases by 0.0228 units, holding other factors constant.
    • For each additional megapixel of front (selfie) camera resolution, the normalized used price increases by 0.0170 units.
    • For each additional GB of RAM, the normalized used price increases by 0.0186 units.
    • For each unit increase in the normalized new price, the normalized used price increases by 0.4180 units.
    • If the brand of the device is Sony, the used price decreases by 0.0682 units relative to the baseline brand.
  • Days of usage has a statistically significant but very small negative effect on the used price (about -9.3e-05 units per day), so it barely changes the value in practice.
  • The model is accurate in predicting the market value of used devices and also identifies the main factors that affect it.
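
To support the points above, a small sketch that ranks the final model's coefficients by absolute size (note that the predictors are on different scales, so this gives only a rough ordering of influence):

In [ ]:
# Sketch: rank the final model's coefficients by absolute magnitude
# (excluding the intercept) to see the strongest drivers of used price.
coefs = olsmodel_final.params.drop("const")
print(coefs.reindex(coefs.abs().sort_values(ascending=False).index))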

Recommendation:

  • ReCell should focus its marketing strategies on the main factors that attract customers and positively affect device prices.
  • ReCell should focus on brands such as Infinix, Xiaomi, and Nokia, which are associated with higher used prices, and be cautious with Sony, which is associated with lower used prices. 4G support should also be present, as it positively affects the value of a device.
  • Further analysis including customer attributes such as age and income should be performed to better predict customer behaviour.
  • ReCell should pay particular attention to used/refurbished Infinix devices, which show the largest positive brand effect on the used price in the final model.

Code created and adapted by Roberto Torres, using the material and examples presented during the course.