2.1.4. Descriptive statistics#
- Nominal: categorical without order
- Numbers
- Ordinal: categorical with order
- Ratio
# Nominal features: categorical columns without order.
nominal_columns = ["Style", "NeckLine", "Material", "Pattern Type"]
df_nomial = df[nominal_columns]
# Summary statistics for the selected columns.
nominal_summary = df_nomial.describe()
print(nominal_summary)
# Preview the first five rows (head() defaults to n=5).
df_nomial.head()
Style NeckLine Material Pattern Type
count 500 497 372 391
unique 13 16 23 14
top Casual o-neck cotton solid
freq 232 271 152 203
Style | NeckLine | Material | Pattern Type | |
---|---|---|---|---|
0 | Sexy | o-neck | NaN | animal |
1 | Casual | o-neck | microfiber | animal |
2 | vintage | o-neck | polyster | |
3 | Brief | o-neck | silk | |
4 | cute | o-neck | chiffonfabric | dot |
# Spark counterpart: summarize the same four columns and print the table.
spark_nominal = df_spark.select(
    "Style", "NeckLine", "Material", "Pattern Type"
)
spark_nominal.describe().show()
+-------+-----+--------+--------+------------+
|summary|Style|NeckLine|Material|Pattern Type|
+-------+-----+--------+--------+------------+
| count| 500| 497| 372| 391|
| mean| null| null| null| null|
| stddev| null| null| null| null|
| min|Brief| Scoop| acrylic| animal|
| max| work| v-neck| wool| striped|
+-------+-----+--------+--------+------------+
# Numbers: the numeric Dress_ID column.
# NOTE(review): Dress_ID looks like an identifier rather than a measurement,
# so mean/std/quartiles may not be meaningful -- confirm.
number_columns = ["Dress_ID"]
df_numbers = df[number_columns]
numbers_summary = df_numbers.describe()
print(numbers_summary)
# Preview the first five rows (head() defaults to n=5).
df_numbers.head()
Dress_ID
count 5.000000e+02
mean 9.055417e+08
std 1.736190e+08
min 4.442820e+08
25% 7.673164e+08
50% 9.083296e+08
75% 1.039534e+09
max 1.253973e+09
Dress_ID | |
---|---|
0 | 1006032852 |
1 | 1212192089 |
2 | 1190380701 |
3 | 966005983 |
4 | 876339541 |
# Ordinal features: categorical columns with order.
ordinal_columns = ["Size", "Price"]
df_ordinal = df[ordinal_columns]
# Summary statistics for the selected columns.
ordinal_summary = df_ordinal.describe()
print(ordinal_summary)
# Preview the first five rows (head() defaults to n=5).
df_ordinal.head()
Size Price
count 500 498
unique 7 7
top M Average
freq 177 252
Size | Price | |
---|---|---|
0 | M | Low |
1 | L | Low |
2 | L | High |
3 | L | Average |
4 | M | Low |
# Ratio feature: the numeric Rating column.
ratio_columns = ["Rating"]
df_ratio = df[ratio_columns]
# Numeric summary of the Rating column.
ratio_summary = df_ratio.describe()
print(ratio_summary)
# Preview the first five rows (head() defaults to n=5).
df_ratio.head()
Rating
count 500.000000
mean 3.528600
std 2.005364
min 0.000000
25% 3.700000
50% 4.600000
75% 4.800000
max 5.000000
Rating | |
---|---|
0 | 4.6 |
1 | 0.0 |
2 | 0.0 |
3 | 4.6 |
4 | 4.5 |
# Central tendency of the Rating column: mean, median and mode.
rating_values = df_ratio["Rating"]
mean_rating = rating_values.mean()
median_rating = rating_values.median()
# mode() returns a Series of modes; take the entry labelled 0.
rating_modes = rating_values.mode()
mode_rating = rating_modes.get(0)
print("Mean rating:", mean_rating)
print("Median rating:", median_rating)
print("Mode rating:", mode_rating)
Mean rating: 3.5286
Median rating: 4.6
Mode rating: 0.0
# Mean of Rating via a Spark aggregation; the result is a single row.
mean_row = df_spark.agg(F.mean(df_spark["Rating"])).first()
mean_rating = mean_row[0]

# Median of Rating via Spark SQL: register the DataFrame as a temporary view,
# then query the 0.5 percentile.
df_spark.createOrReplaceTempView("df_spark")
median_query = """
SELECT percentile(Rating, 0.5) AS median_rating
FROM df_spark
"""
median_row = spark.sql(median_query).first()
median_rating = median_row["median_rating"]

print("Mean rating:", mean_rating)
print("Median rating:", median_rating)
Mean rating: 3.5285999937057495
Median rating: 4.599999904632568
# Range of Rating: largest value minus smallest value.
rating_values = df_ratio["Rating"]
max_rating = rating_values.max()
min_rating = rating_values.min()
range_rating = max_rating - min_rating
print("Min rating:", min_rating)
print("Max rating:", max_rating)
print("Rating range:", range_rating)
Min rating: 0.0
Max rating: 5.0
Rating range: 5.0
# Variance of Rating; ddof=1 is the pandas default, spelled out here.
rating_values = df_ratio["Rating"]
var_rating = rating_values.var(ddof=1)
print("Rating variance:", var_rating)
Rating variance: 4.02148501002004
# Standard deviation of Rating; ddof=1 is the pandas default, spelled out here.
rating_values = df_ratio["Rating"]
std_rating = rating_values.std(ddof=1)
print("Rating standard deviation:", std_rating)
Rating standard deviation: 2.0053640592221753