Read datasets

2.1.2. Read datasets

# import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# Load the dress-attribute Excel workbook into a pandas DataFrame.
df = pd.read_excel(
    io=r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.xlsx",
)
# Preview the first five rows.
df.head(n=5)
Dress_ID Style Price Rating Size Season NeckLine SleeveLength waiseline Material FabricType Decoration Pattern Type Recommendation
0 1006032852 Sexy Low 4.6 M Summer o-neck sleevless empire NaN chiffon ruffles animal 1
1 1212192089 Casual Low 0.0 L Summer o-neck Petal natural microfiber NaN ruffles animal 0
2 1190380701 vintage High 0.0 L Automn o-neck full natural polyster NaN NaN print 0
3 966005983 Brief Average 4.6 L Spring o-neck full natural silk chiffon embroidary print 1
4 876339541 cute Low 4.5 M Summer o-neck butterfly natural chiffonfabric chiffon bow dot 0
# Export the DataFrame as CSV (row index omitted); the Spark reader further
# down loads this same path.
df.to_csv(
    path_or_buf=r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.csv",
    index=False,
)
# Read the same dataset with Spark.

# Create (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data description Spark")
    .getOrCreate()
)

# Explicit schema so Spark does not have to infer column types.
# Every column is nullable; Dress_ID is read as a string, Rating as a float
# and Recommendation as an integer.
schema = StructType([
    StructField("Dress_ID", StringType(), True),
    StructField("Style", StringType(), True),
    StructField("Price", StringType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Size", StringType(), True),
    StructField("Season", StringType(), True),
    StructField("NeckLine", StringType(), True),
    StructField("SleeveLength", StringType(), True),
    StructField("waiseline", StringType(), True),
    StructField("Material", StringType(), True),
    StructField("FabricType", StringType(), True),
    StructField("Decoration", StringType(), True),
    StructField("Pattern Type", StringType(), True),
    StructField("Recommendation", IntegerType(), True),
])

# Load the CSV with Spark's built-in CSV reader. The previous
# "com.databricks.spark.csv" format name is the legacy external-package
# identifier from Spark 1.x; SparkSession-based code (Spark 2.0+) ships the
# CSV source natively, so the dedicated reader method is used instead.
df_spark = spark.read.csv(
    r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.csv",
    schema=schema,
    header=True,  # first line of the file holds the column names
)

# Display the first five rows.
df_spark.show(5)
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
|  Dress_ID|  Style|  Price|Rating|Size|Season|NeckLine|SleeveLength|waiseline|     Material|FabricType|Decoration|Pattern Type|Recommendation|
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
|1006032852|   Sexy|    Low|   4.6|   M|Summer|  o-neck|   sleevless|   empire|         null|   chiffon|   ruffles|      animal|             1|
|1212192089| Casual|    Low|   0.0|   L|Summer|  o-neck|       Petal|  natural|   microfiber|      null|   ruffles|      animal|             0|
|1190380701|vintage|   High|   0.0|   L|Automn|  o-neck|        full|  natural|     polyster|      null|      null|       print|             0|
| 966005983|  Brief|Average|   4.6|   L|Spring|  o-neck|        full|  natural|         silk|   chiffon|embroidary|       print|             1|
| 876339541|   cute|    Low|   4.5|   M|Summer|  o-neck|   butterfly|  natural|chiffonfabric|   chiffon|       bow|         dot|             0|
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
only showing top 5 rows