Read datasets

2.1.2. Read datasets

# import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import *

# Load the dress-attribute spreadsheet into a pandas DataFrame,
# then preview its first five rows.
df = pd.read_excel(
    io=r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.xlsx",
)
df.head(n=5)

	Dress_ID	Style	Price	Rating	Size	Season	NeckLine	SleeveLength	waiseline	Material	FabricType	Decoration	Pattern Type	Recommendation
0	1006032852	Sexy	Low	4.6	M	Summer	o-neck	sleevless	empire	NaN	chiffon	ruffles	animal	1
1	1212192089	Casual	Low	0.0	L	Summer	o-neck	Petal	natural	microfiber	NaN	ruffles	animal	0
2	1190380701	vintage	High	0.0	L	Automn	o-neck	full	natural	polyster	NaN	NaN	print	0
3	966005983	Brief	Average	4.6	L	Spring	o-neck	full	natural	silk	chiffon	embroidary	print	1
4	876339541	cute	Low	4.5	M	Summer	o-neck	butterfly	natural	chiffonfabric	chiffon	bow	dot	0

# Export the DataFrame as CSV next to the original workbook; index=False
# keeps the pandas row index out of the file so the columns match the
# sheet's header exactly.
df.to_csv(
    path_or_buf=r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.csv",
    index=False,
)

# Read the same data with Spark.

# Create (or reuse) a Spark session that runs against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data description Spark")
    .getOrCreate()
)

# Explicit schema for the CSV: every column is nullable; Rating is a float,
# Recommendation is an integer, and all remaining columns (including
# Dress_ID) are kept as strings.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Dress_ID", StringType()),
        ("Style", StringType()),
        ("Price", StringType()),
        ("Rating", FloatType()),
        ("Size", StringType()),
        ("Season", StringType()),
        ("NeckLine", StringType()),
        ("SleeveLength", StringType()),
        ("waiseline", StringType()),
        ("Material", StringType()),
        ("FabricType", StringType()),
        ("Decoration", StringType()),
        ("Pattern Type", StringType()),
        ("Recommendation", IntegerType()),
    )
])

# Load the CSV into a Spark DataFrame using the explicit schema defined
# above. The built-in CSV reader is used instead of the legacy
# "com.databricks.spark.csv" external package name, which is only needed on
# Spark 1.x; SparkSession itself requires Spark 2.0+, where CSV support
# ships with Spark.
df_spark = (
    spark.read
    .schema(schema)
    .option("header", "true")  # first line of the file holds column names
    .csv(r"Dataset/Dresses_Attribute_Sales/Attribute DataSet.csv")
)

# Print the first five rows as a quick sanity check.
df_spark.show(5)

+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
|  Dress_ID|  Style|  Price|Rating|Size|Season|NeckLine|SleeveLength|waiseline|     Material|FabricType|Decoration|Pattern Type|Recommendation|
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
|1006032852|   Sexy|    Low|   4.6|   M|Summer|  o-neck|   sleevless|   empire|         null|   chiffon|   ruffles|      animal|             1|
|1212192089| Casual|    Low|   0.0|   L|Summer|  o-neck|       Petal|  natural|   microfiber|      null|   ruffles|      animal|             0|
|1190380701|vintage|   High|   0.0|   L|Automn|  o-neck|        full|  natural|     polyster|      null|      null|       print|             0|
| 966005983|  Brief|Average|   4.6|   L|Spring|  o-neck|        full|  natural|         silk|   chiffon|embroidary|       print|             1|
| 876339541|   cute|    Low|   4.5|   M|Summer|  o-neck|   butterfly|  natural|chiffonfabric|   chiffon|       bow|         dot|             0|
+----------+-------+-------+------+----+------+--------+------------+---------+-------------+----------+----------+------------+--------------+
only showing top 5 rows