Colab

x=0

import pandas as pd

df = pd.read_csv('diamonds.csv')

type(df)

pandas.core.frame.DataFrame
# df.head() 
df.head(10) 


# carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
5 6 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
6 7 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
7 8 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
8 9 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
9 10 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
df.tail()

# carat cut color clarity depth table price x y z
1106 1107 0.91 Premium I SI2 62.0 59.0 2913 6.18 6.23 3.85
1107 1108 0.83 Premium E SI1 62.2 59.0 2913 6.05 5.97 3.74
1108 1109 0.85 Ideal G SI2 62.0 57.0 2913 6.10 6.02 3.76
1109 1110 0.80 Very Good F SI1 63.5 55.0 2914 5.86 5.89 3.73
1110 1111 0.73 Ideal E SI1 61.4 58.0 2914 5.76 5.80 3.55
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111 entries, 0 to 1110
Data columns (total 11 columns):
#          1111 non-null int64
carat      1111 non-null float64
cut        1111 non-null object
color      1111 non-null object
clarity    1111 non-null object
depth      1111 non-null float64
table      1111 non-null float64
price      1111 non-null int64
x          1111 non-null float64
y          1111 non-null float64
z          1111 non-null float64
dtypes: float64(6), int64(2), object(3)
memory usage: 82.5+ KB
df.describe()

# carat depth table price x y z
count 1111.00000 1111.000000 1111.000000 1111.000000 1111.000000 1111.000000 1111.000000 1111.000000
mean 556.00000 0.684086 61.735914 57.708731 2456.058506 5.588587 5.581683 3.447642
std 320.86238 0.198297 1.731228 2.430179 864.254630 0.634599 0.623776 0.396959
min 1.00000 0.200000 53.000000 52.000000 326.000000 3.790000 3.750000 2.270000
25% 278.50000 0.700000 60.900000 56.000000 2777.000000 5.620000 5.610000 3.440000
50% 556.00000 0.710000 61.800000 57.000000 2822.000000 5.760000 5.760000 3.550000
75% 833.50000 0.790000 62.600000 59.000000 2862.500000 5.910000 5.910000 3.640000
max 1111.00000 1.270000 69.500000 70.000000 2914.000000 7.120000 7.050000 4.330000
df['carat'].head()

0    0.23
1    0.21
2    0.23
3    0.29
4    0.31
Name: carat, dtype: float64
df[['carat', 'price']].head()
# df[  ['carat', 'price']  ].head()

carat price
0 0.23 326
1 0.21 326
2 0.23 327
3 0.29 334
4 0.31 335
df.iloc[0]

#              1
carat       0.23
cut        Ideal
color          E
clarity      SI2
depth       61.5
table         55
price        326
x           3.95
y           3.98
z           2.43
Name: 0, dtype: object
# df.iloc[0]['carat']
df.iloc[0][['carat', 'price']]

carat    0.23
price     326
Name: 0, dtype: object
df.iloc[:10]

# carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
5 6 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
6 7 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
7 8 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
8 9 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
9 10 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
df.iloc[ [0, 2, 1, 0] ]

# carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
row0 = df.iloc[0]
row0

#              1
carat       0.23
cut        Ideal
color          E
clarity      SI2
depth       61.5
table         55
price        326
x           3.95
y           3.98
z           2.43
Name: 0, dtype: object
row0['price']

326
row0.iloc[1]

0.23
row0.loc['price']

326
carats = df.set_index('carat')
carats.head()

# cut color clarity depth table price x y z
carat
0.23 1 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
0.21 2 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
0.23 3 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
0.29 4 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
0.31 5 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
carats.loc[0.21]

#                2
cut        Premium
color            E
clarity        SI1
depth         59.8
table           61
price          326
x             3.89
y             3.84
z             2.31
Name: 0.21, dtype: object
carats.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1111 entries, 0.23 to 0.73
Data columns (total 10 columns):
#          1111 non-null int64
cut        1111 non-null object
color      1111 non-null object
clarity    1111 non-null object
depth      1111 non-null float64
table      1111 non-null float64
price      1111 non-null int64
x          1111 non-null float64
y          1111 non-null float64
z          1111 non-null float64
dtypes: float64(5), int64(2), object(3)
memory usage: 114.5+ KB
carats.reset_index()

carat # cut color clarity depth table price x y z
0 0.23 1 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 2 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 3 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 4 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 5 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
... ... ... ... ... ... ... ... ... ... ... ...
1106 0.91 1107 Premium I SI2 62.0 59.0 2913 6.18 6.23 3.85
1107 0.83 1108 Premium E SI1 62.2 59.0 2913 6.05 5.97 3.74
1108 0.85 1109 Ideal G SI2 62.0 57.0 2913 6.10 6.02 3.76
1109 0.80 1110 Very Good F SI1 63.5 55.0 2914 5.86 5.89 3.73
1110 0.73 1111 Ideal E SI1 61.4 58.0 2914 5.76 5.80 3.55

1111 rows × 11 columns

df.head()

# carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
df.head() *2

# carat cut color clarity depth table price x y z
0 2 0.46 IdealIdeal EE SI2SI2 123.0 110.0 652 7.90 7.96 4.86
1 4 0.42 PremiumPremium EE SI1SI1 119.6 122.0 652 7.78 7.68 4.62
2 6 0.46 GoodGood EE VS1VS1 113.8 130.0 654 8.10 8.14 4.62
3 8 0.58 PremiumPremium II VS2VS2 124.8 116.0 668 8.40 8.46 5.26
4 10 0.62 GoodGood JJ SI2SI2 126.6 116.0 670 8.68 8.70 5.50
(df['carat'] + 1).head()

0    1.23
1    1.21
2    1.23
3    1.29
4    1.31
Name: carat, dtype: float64
df['expensive'] = df['price'] > 1000
df.head()

# carat cut color clarity depth table price x y z expensive
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 False
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 False
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 False
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 False
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 False
df.tail()

# carat cut color clarity depth table price x y z expensive
1106 1107 0.91 Premium I SI2 62.0 59.0 2913 6.18 6.23 3.85 True
1107 1108 0.83 Premium E SI1 62.2 59.0 2913 6.05 5.97 3.74 True
1108 1109 0.85 Ideal G SI2 62.0 57.0 2913 6.10 6.02 3.76 True
1109 1110 0.80 Very Good F SI1 63.5 55.0 2914 5.86 5.89 3.73 True
1110 1111 0.73 Ideal E SI1 61.4 58.0 2914 5.76 5.80 3.55 True
def is_large(row):
    """Classify a row as "big" or "small" from its 'x', 'y' and 'z' entries.

    Args:
        row: Any mapping-like object (such as a DataFrame row) that can be
            indexed with the keys 'x', 'y' and 'z'.

    Returns:
        "big" when the product x * y * z is strictly greater than 20,
        otherwise "small".
    """
    x_val, y_val, z_val = row['x'], row['y'], row['z']
    return "big" if x_val * y_val * z_val > 20 else "small"

df['volume_label'] = df.apply(is_large, axis=1)
df.head()

# carat cut color clarity depth table price x y z expensive volume_label
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 False big
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 False big
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 False big
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 False big
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 False big
df['expensive'].replace({True: 'yes!', False: 'no...'})

0       no...
1       no...
2       no...
3       no...
4       no...
        ...  
1106     yes!
1107     yes!
1108     yes!
1109     yes!
1110     yes!
Name: expensive, Length: 1111, dtype: object
def depth_as_inch(row):
    """Return a copy of ``row`` with 'depth' replaced by a 'depth_inch' entry.

    Args:
        row: A pandas Series (for example one DataFrame row handed over by
            ``df.apply(..., axis=1)``) that has a 'depth' label.

    Returns:
        A new Series without the 'depth' label and with 'depth_inch', equal
        to ``row['depth'] / 3``, appended at the end. The input row is left
        unmodified.

    Raises:
        KeyError: If ``row`` has no 'depth' label.
    """
    inch = row['depth'] / 3
    # Series.drop returns a new object instead of modifying the row in place,
    # so its result must be kept; otherwise 'depth' is never removed.
    converted = row.drop('depth')
    converted['depth_inch'] = inch
    return converted

df.apply(depth_as_inch, axis=1)

# carat cut color clarity depth table price x y z expensive volume_label
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 False big
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 False big
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 False big
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 False big
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 False big
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1106 1107 0.91 Premium I SI2 62.0 59.0 2913 6.18 6.23 3.85 True big
1107 1108 0.83 Premium E SI1 62.2 59.0 2913 6.05 5.97 3.74 True big
1108 1109 0.85 Ideal G SI2 62.0 57.0 2913 6.10 6.02 3.76 True big
1109 1110 0.80 Very Good F SI1 63.5 55.0 2914 5.86 5.89 3.73 True big
1110 1111 0.73 Ideal E SI1 61.4 58.0 2914 5.76 5.80 3.55 True big

1111 rows × 13 columns

df['carat'] + 1

0       1.23
1       1.21
2       1.23
3       1.29
4       1.31
        ... 
1106    1.91
1107    1.83
1108    1.85
1109    1.80
1110    1.73
Name: carat, Length: 1111, dtype: float64
df

# carat cut color clarity depth table price x y z expensive volume_label
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 False big
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 False big
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 False big
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 False big
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 False big
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1106 1107 0.91 Premium I SI2 62.0 59.0 2913 6.18 6.23 3.85 True big
1107 1108 0.83 Premium E SI1 62.2 59.0 2913 6.05 5.97 3.74 True big
1108 1109 0.85 Ideal G SI2 62.0 57.0 2913 6.10 6.02 3.76 True big
1109 1110 0.80 Very Good F SI1 63.5 55.0 2914 5.86 5.89 3.73 True big
1110 1111 0.73 Ideal E SI1 61.4 58.0 2914 5.76 5.80 3.55 True big

1111 rows × 13 columns

# Average every numeric (and boolean) column within each cut.
# numeric_only=True is passed explicitly because the frame also holds string
# columns; pandas 2.0+ raises a TypeError on them instead of dropping them.
df.groupby('cut').mean(numeric_only=True)

# carat depth table price x y z expensive
cut
Fair 606.666667 0.902424 63.457576 60.242424 2800.636364 6.105909 6.026667 3.847273 0.984848
Good 507.020202 0.649394 62.322222 58.426263 2210.737374 5.421313 5.433535 3.381111 0.737374
Ideal 586.541333 0.664773 61.695200 55.973067 2497.981333 5.559173 5.569333 3.432240 0.853333
Premium 567.831804 0.690979 61.223853 58.776758 2464.987768 5.640826 5.597248 3.439205 0.840979
Very Good 499.372951 0.659549 61.781148 57.968443 2385.991803 5.491721 5.519549 3.401516 0.811475
df.groupby('cut')[['price', 'carat']].mean()

price carat
cut
Fair 2800.636364 0.902424
Good 2210.737374 0.649394
Ideal 2497.981333 0.664773
Premium 2464.987768 0.690979
Very Good 2385.991803 0.659549
clarity_cut = df.groupby(['cut', 'clarity'])[['price', 'carat']].mean()
clarity_cut.head()

price carat
cut clarity
Fair I1 2824.250000 1.100000
IF 2861.000000 0.620000
SI1 2855.562500 0.925625
SI2 2835.172414 0.956207
VS1 2853.250000 0.807500
clarity_cut['price'].sort_values(ascending = False)

cut        clarity
Fair       IF         2861.000000
           SI1        2855.562500
           VS1        2853.250000
Ideal      I1         2846.428571
Good       I1         2845.000000
Very Good  IF         2841.750000
Premium    IF         2837.000000
Fair       SI2        2835.172414
Good       VVS2       2827.333333
Fair       I1         2824.250000
Ideal      IF         2814.428571
Fair       VVS1       2801.000000
Ideal      VVS2       2725.631579
Very Good  SI2        2665.965517
Premium    VVS2       2664.076923
           I1         2630.166667
Ideal      SI1        2592.385246
           VS2        2577.722222
Good       VS2        2577.107143
Fair       VS2        2576.000000
Premium    VS1        2571.245283
           VS2        2559.631579
Very Good  VS2        2468.000000
           SI1        2405.694444
Premium    SI1        2374.200000
           SI2        2360.282051
Ideal      VVS1       2353.363636
           SI2        2303.129032
Very Good  VS1        2295.511111
           I1         2268.750000
Ideal      VS1        2261.923077
Good       VS1        2152.277778
           VVS1       2110.000000
           SI2        2092.076923
Very Good  VVS1       2061.333333
           VVS2       1961.687500
Good       SI1        1883.250000
Premium    VVS1       1674.875000
Name: price, dtype: float64
df['cut'].value_counts()

Ideal        375
Premium      327
Very Good    244
Good          99
Fair          66
Name: cut, dtype: int64
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)
df.head()

# carat cut color clarity depth table price x y z expensive volume_label
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 False big
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 False big
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 False big
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 False big
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 False big
df.plot.scatter(x='x', y='y')

<matplotlib.axes._subplots.AxesSubplot at 0xc57e130>

svg

df.plot.scatter(x='carat', y='price')

<matplotlib.axes._subplots.AxesSubplot at 0xc562cf0>

svg

df.groupby('cut')['price'].mean().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0xc6b53d0>

svg


# Re-type 'cut' as an ordered categorical whose categories are listed
# explicitly, from 'Fair' up to 'Ideal'.
df['cut'] = df['cut'].astype(
    pd.CategoricalDtype(
        categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
        ordered=True,
    )
)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111 entries, 0 to 1110
Data columns (total 13 columns):
#               1111 non-null int64
carat           1111 non-null float64
cut             1111 non-null category
color           1111 non-null object
clarity         1111 non-null object
depth           1111 non-null float64
table           1111 non-null float64
price           1111 non-null int64
x               1111 non-null float64
y               1111 non-null float64
z               1111 non-null float64
expensive       1111 non-null bool
volume_label    1111 non-null object
dtypes: bool(1), category(1), float64(6), int64(2), object(3)
memory usage: 84.8+ KB
df.groupby('cut')['price'].mean().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0xc51cd70>

svg

df.groupby('cut')['price'].mean().sort_values().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0xdd60cd0>

svg

mask = df['cut'] == "Ideal"
df[mask].head()

# carat cut color clarity depth table price x y z expensive volume_label
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 False big
11 12 0.23 Ideal J VS1 62.8 56.0 340 3.93 3.90 2.46 False big
13 14 0.31 Ideal J SI2 62.2 54.0 344 4.35 4.37 2.71 False big
16 17 0.30 Ideal I SI2 62.0 54.0 348 4.31 4.34 2.68 False big
39 40 0.33 Ideal I SI2 61.8 55.0 403 4.49 4.51 2.78 False big
mask = (df['cut'] == "Ideal") & (df['color'] == 'J')
df[mask].head()

# carat cut color clarity depth table price x y z expensive volume_label
11 12 0.23 Ideal J VS1 62.8 56.0 340 3.93 3.90 2.46 False big
13 14 0.31 Ideal J SI2 62.2 54.0 344 4.35 4.37 2.71 False big
41 42 0.33 Ideal J SI1 61.1 56.0 403 4.49 4.55 2.76 False big
460 461 0.90 Ideal J VS2 62.8 55.0 2817 6.20 6.16 3.88 True big
681 682 0.75 Ideal J SI1 61.5 56.0 2850 5.83 5.87 3.60 True big
df[   (df['cut'] == "Ideal") & (df['color'] == 'J')  ].head()

# carat cut color clarity depth table price x y z expensive volume_label
11 12 0.23 Ideal J VS1 62.8 56.0 340 3.93 3.90 2.46 False big
13 14 0.31 Ideal J SI2 62.2 54.0 344 4.35 4.37 2.71 False big
41 42 0.33 Ideal J SI1 61.1 56.0 403 4.49 4.55 2.76 False big
460 461 0.90 Ideal J VS2 62.8 55.0 2817 6.20 6.16 3.88 True big
681 682 0.75 Ideal J SI1 61.5 56.0 2850 5.83 5.87 3.60 True big
df[df['price'] > 1000].head()

# carat cut color clarity depth table price x y z expensive volume_label
90 91 0.70 Ideal E SI1 62.5 57.0 2757 5.70 5.72 3.57 True big
91 92 0.86 Fair E SI2 55.1 69.0 2757 6.45 6.33 3.52 True big
92 93 0.70 Ideal G VS2 61.6 56.0 2757 5.70 5.67 3.50 True big
93 94 0.71 Very Good E VS2 62.4 57.0 2759 5.68 5.73 3.56 True big
94 95 0.78 Very Good G SI2 63.8 56.0 2759 5.81 5.85 3.72 True big
df.loc[[90, 91]]

# carat cut color clarity depth table price x y z expensive volume_label
90 91 0.70 Ideal E SI1 62.5 57.0 2757 5.70 5.72 3.57 True big
91 92 0.86 Fair E SI2 55.1 69.0 2757 6.45 6.33 3.52 True big