2.1.2. Read datasets
# import all necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# Load the dress-attribute workbook into a pandas DataFrame and preview
# its first rows.
excel_path = r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.xlsx"
df = pd.read_excel(io=excel_path)
df.head()
Dress_ID | Style | Price | Rating | Size | Season | NeckLine | SleeveLength | waiseline | Material | FabricType | Decoration | Pattern Type | Recommendation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1006032852 | Sexy | Low | 4.6 | M | Summer | o-neck | sleevless | empire | NaN | chiffon | ruffles | animal | 1 |
1 | 1212192089 | Casual | Low | 0.0 | L | Summer | o-neck | Petal | natural | microfiber | NaN | ruffles | animal | 0 |
2 | 1190380701 | vintage | High | 0.0 | L | Automn | o-neck | full | natural | polyster | NaN | NaN | print | 0 |
3 | 966005983 | Brief | Average | 4.6 | L | Spring | o-neck | full | natural | silk | chiffon | embroidary | print | 1 |
4 | 876339541 | cute | Low | 4.5 | M | Summer | o-neck | butterfly | natural | chiffonfabric | chiffon | bow | dot | 0 |
# Write the pandas DataFrame out as CSV, omitting the row index column.
csv_path = r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.csv"
df.to_csv(path_or_buf=csv_path, index=False)
# read by spark
# Start (or reuse) a Spark session running with the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data description Spark")
    .getOrCreate()
)

# Explicit schema for the CSV: every column is nullable. Dress_ID is read
# as a string, Rating as a float and Recommendation as an integer; all
# remaining columns are strings.
schema = StructType([
    StructField("Dress_ID", StringType(), True),
    StructField("Style", StringType(), True),
    StructField("Price", StringType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Size", StringType(), True),
    StructField("Season", StringType(), True),
    StructField("NeckLine", StringType(), True),
    StructField("SleeveLength", StringType(), True),
    StructField("waiseline", StringType(), True),
    StructField("Material", StringType(), True),
    StructField("FabricType", StringType(), True),
    StructField("Decoration", StringType(), True),
    StructField("Pattern Type", StringType(), True),
    StructField("Recommendation", IntegerType(), True),
])

# Use Spark's built-in CSV reader instead of the legacy
# "com.databricks.spark.csv" format name, which referred to the external
# spark-csv package from Spark 1.x. SparkSession already implies Spark 2+,
# where CSV support is built in.
df_spark = spark.read.csv(
    r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.csv",
    schema=schema,
    header=True,  # first line of the file holds the column names
)
df_spark.show(5)
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
| Dress_ID| Style| Price|Rating|Size|Season|NeckLine|SleeveLength|waiseline| Material|FabricType|Decoration|Pattern Type|Recommendation|
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
|1006032852| Sexy| Low| 4.6| M|Summer| o-neck| sleevless| empire| null| chiffon| ruffles| animal| 1|
|1212192089| Casual| Low| 0.0| L|Summer| o-neck| Petal| natural| microfiber| null| ruffles| animal| 0|
|1190380701|vintage| High| 0.0| L|Automn| o-neck| full| natural| polyster| null| null| print| 0|
| 966005983| Brief|Average| 4.6| L|Spring| o-neck| full| natural| silk| chiffon|embroidary| print| 1|
| 876339541| cute| Low| 4.5| M|Summer| o-neck| butterfly| natural|chiffonfabric| chiffon| bow| dot| 0|
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
only showing top 5 rows