Descriptive statistics

2.1.4. Descriptive statistics

- Nominal: categorical without order
- Numbers: plain numeric values
- Ordinal: categorical with order
- Ratio: numeric values with a meaningful zero
# Nominal: categorical columns whose values have no inherent order.
# describe() on these text columns reports count / unique / top / freq.
nominal_columns = ["Style", "NeckLine", "Material", "Pattern Type"]
df_nomial = df[nominal_columns]
nominal_summary = df_nomial.describe()
print(nominal_summary)
# Last expression of the cell: the notebook renders the first five rows.
df_nomial.head(5)
         Style NeckLine Material Pattern Type
count      500      497      372          391
unique      13       16       23           14
top     Casual   o-neck   cotton        solid
freq       232      271      152          203
Style NeckLine Material Pattern Type
0 Sexy o-neck NaN animal
1 Casual o-neck microfiber animal
2 vintage o-neck polyster print
3 Brief o-neck silk print
4 cute o-neck chiffonfabric dot
# Same four categorical columns, summarized through the Spark DataFrame API.
spark_nominal_columns = ["Style", "NeckLine", "Material", "Pattern Type"]
df_spark.select(*spark_nominal_columns).describe().show()
+-------+-----+--------+--------+------------+
|summary|Style|NeckLine|Material|Pattern Type|
+-------+-----+--------+--------+------------+
|  count|  500|     497|     372|         391|
|   mean| null|    null|    null|        null|
| stddev| null|    null|    null|        null|
|    min|Brief|   Scoop| acrylic|      animal|
|    max| work|  v-neck|    wool|     striped|
+-------+-----+--------+--------+------------+
# Numbers: describe() on a numeric column reports count, mean, std,
# min, the quartiles and max.
number_columns = ["Dress_ID"]
df_numbers = df[number_columns]
numbers_summary = df_numbers.describe()
print(numbers_summary)
# Last expression of the cell: the notebook renders the first five rows.
df_numbers.head(5)
           Dress_ID
count  5.000000e+02
mean   9.055417e+08
std    1.736190e+08
min    4.442820e+08
25%    7.673164e+08
50%    9.083296e+08
75%    1.039534e+09
max    1.253973e+09
Dress_ID
0 1006032852
1 1212192089
2 1190380701
3 966005983
4 876339541
# Ordinal: categorical columns whose values carry an order.
# They are stored as text, so describe() reports count / unique / top / freq.
ordinal_columns = ["Size", "Price"]
df_ordinal = df[ordinal_columns]
ordinal_summary = df_ordinal.describe()
print(ordinal_summary)
# Last expression of the cell: the notebook renders the first five rows.
df_ordinal.head(5)
       Size    Price
count   500      498
unique    7        7
top       M  Average
freq    177      252
Size Price
0 M Low
1 L Low
2 L High
3 L Average
4 M Low
# Ratio: the numeric Rating column, kept as a one-column DataFrame so the
# cells that follow can read df_ratio["Rating"].
ratio_columns = ["Rating"]
df_ratio = df[ratio_columns]
ratio_summary = df_ratio.describe()
print(ratio_summary)
# Last expression of the cell: the notebook renders the first five rows.
df_ratio.head(5)
           Rating
count  500.000000
mean     3.528600
std      2.005364
min      0.000000
25%      3.700000
50%      4.600000
75%      4.800000
max      5.000000
Rating
0 4.6
1 0.0
2 0.0
3 4.6
4 4.5
# Central tendency of Rating (pandas): mean, median and mode.
rating_series = df_ratio["Rating"]
mean_rating = rating_series.mean()
median_rating = rating_series.median()
# mode() returns a Series of the most frequent value(s); take the entry at label 0.
rating_modes = rating_series.mode()
mode_rating = rating_modes.get(0)

print("Mean rating:", mean_rating)
print("Median rating:", median_rating)
print("Mode rating:", mode_rating)
Mean rating: 3.5286
Median rating: 4.6
Mode rating: 0.0
# Mean of Rating (Spark): the aggregation yields a single row whose
# first field is the average.
mean_row = df_spark.agg(F.mean(df_spark["Rating"])).first()
mean_rating = mean_row[0]

# Median of Rating (Spark): register the DataFrame as a temp view so it can
# be queried with SQL, then read the 0.5 percentile from the single result row.
df_spark.createOrReplaceTempView("df_spark")
median_query = """
    SELECT percentile(Rating, 0.5) AS median_rating 
    FROM df_spark
"""
median_row = spark.sql(median_query).first()
median_rating = median_row["median_rating"]

print("Mean rating:", mean_rating)
print("Median rating:", median_rating)
Mean rating: 3.5285999937057495
Median rating: 4.599999904632568
# Range of Rating: distance between the smallest and largest observed value.
rating_values = df_ratio["Rating"]
min_rating, max_rating = rating_values.min(), rating_values.max()
range_rating = max_rating - min_rating

print("Min rating:", min_rating)
print("Max rating:", max_rating)
print("Rating range:", range_rating)
Min rating: 0.0
Max rating: 5.0
Rating range: 5.0
# Variance of Rating: sample variance (ddof=1 is the pandas default,
# spelled out here).
rating_for_variance = df_ratio["Rating"]
var_rating = rating_for_variance.var(ddof=1)
print("Rating variance:", var_rating)
Rating variance: 4.02148501002004
# Standard deviation of Rating: sample standard deviation (ddof=1 is the
# pandas default, spelled out here).
rating_for_std = df_ratio["Rating"]
std_rating = rating_for_std.std(ddof=1)
print("Rating standard deviation:", std_rating)
Rating standard deviation: 2.0053640592221753