import pandas as pd


from pandas import Series, DataFrame


d = {"one": 7, "two": 8, "three": 9}
d

{'one': 7, 'two': 8, 'three': 9}


# dict to Series
s = Series(d)
s

# IP = integer position (what .iloc uses); index = label (what .loc uses)
# IP  index    value
# 0   one      7
# 1   two      8
# 2   three    9
# dtype: int64

one      7
two      8
three    9
dtype: int64


print(s.loc["one"])
print(s.loc["three"])
print(s.loc["two"])

7
9
8


print(s.iloc[0])
print(s.iloc[2])

7
9


# Series to dict
# prefer the dedicated converter: dict(s) also works, but it fetches each value
# by label and can leave NumPy scalars (e.g. int64) as the dict values, which
# changes how the dict is displayed
s.to_dict()

{'one': 7, 'two': 8, 'three': 9}


num_list = [100, 200, 300]
print(type(num_list))

num_series = Series(num_list) # create Series from list
print(type(num_series))

<class 'list'>
<class 'pandas.core.series.Series'>


# displaying a list:
num_list

[100, 200, 300]


# displaying a Series:
num_series

# IP  index value
# 0   0      100
# 1   1      200
# 2   2      300
# dtype: int64

0    100
1    200
2    300
dtype: int64


num_series.iloc[0], num_series.loc[0]

(100, 100)


list(num_series)

[100, 200, 300]


letter_list = ["A", "B", "C", "D"]
letter_series = Series(letter_list)
letter_series

0    A
1    B
2    C
3    D
dtype: object


letter_list[0]

'A'


letter_series.loc[0]

'A'


letter_list[3]

'D'


letter_series.iloc[3] # integer position is the same as index

'D'


letter_list[-1]

'D'


# but be careful!  Series don't support negative indexes to the extent that lists do:
# .loc looks up by index label, and there is no label -1, so it raises KeyError;
# .iloc looks up by integer position, where -1 means "last", just like a list
try:
    print(letter_series.loc[-1]) # BAD
except KeyError as e:  # catch only the expected failed label lookup
    print(type(e))

letter_series.iloc[-1] # OK

<class 'KeyError'>

'D'


print("list slice:")
print(letter_list[:2])
print("\nseries slice:")
print(letter_series.iloc[:2])

list slice:
['A', 'B']

series slice:
0    A
1    B
dtype: object


print("list slice:")
print(letter_list[2:])
print("\nseries slice:")
print(letter_series.iloc[2:])

list slice:
['C', 'D']

series slice:
2    C
3    D
dtype: object


# although we CANNOT always do negative indexing with a Series
# we CAN use negative numbers in a Series slice
print("list slice:")
print(letter_list[:-1])
print("\nseries slice:")
print(letter_series.iloc[:-1])

list slice:
['A', 'B', 'C']

series slice:
0    A
1    B
2    C
dtype: object


s = Series({0: "A", 1: "B", 2: "C"})
s

0    A
1    B
2    C
dtype: object


s.iloc[1:]

1    B
2    C
dtype: object


orig_nums = [100, 200, 300]
# list comprehension: build a new list holding each original number plus one
new_nums = [num + 1 for num in orig_nums]
new_nums

[101, 201, 301]


nums = Series([100, 200, 300])
nums + 1

0    101
1    201
2    301
dtype: int64


[1, 2, 3] * 3

[1, 2, 3, 1, 2, 3, 1, 2, 3]


Series([1, 2, 3]) * 3

0    3
1    6
2    9
dtype: int64


[10, 20] + [3, 4]

[10, 20, 3, 4]


Series([10, 20]) + Series([3, 4])

0    13
1    24
dtype: int64


Series([10, 20, 30]) + Series([1, 2])

0    11.0
1    22.0
2     NaN
dtype: float64


print(nums)
nums + 1
print(nums)
nums = nums + 1
nums

0    100
1    200
2    300
dtype: int64
0    100
1    200
2    300
dtype: int64

0    101
1    201
2    301
dtype: int64


nums -= 1
nums

0    100
1    200
2    300
dtype: int64


nums / 5 # like regular division, this produces float type, which is represented as float64

0    20.0
1    40.0
2    60.0
dtype: float64


# two equal-length lists, each wrapped in a Series with the default 0..2 index
l1 = [1, 2, 3]
l2 = [4, 5, 6]
s1, s2 = pd.Series(l1), pd.Series(l2)
print(s1, s2, sep="\n")
# element-wise product: values that share an index label are multiplied together
s1 * s2

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64

0     4
1    10
2    18
dtype: int64


print(s1)
print(s2)
s1 / s2

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64

0    0.25
1    0.40
2    0.50
dtype: float64


print(s1)
print(s2)
s2 ** s1

0    1
1    2
2    3
dtype: int64
0    4
1    5
2    6
dtype: int64

0      4
1     25
2    216
dtype: int64


pd.Series(["a", "Alice", True, 1, 4.5, [1,2], {"a":"Alice"}])

0                 a
1             Alice
2              True
3                 1
4               4.5
5            [1, 2]
6    {'a': 'Alice'}
dtype: object


s = pd.Series({"A": 10, "B": 20})
print(s)
s["Z"] = 100
s

A    10
B    20
dtype: int64

A     10
B     20
Z    100
dtype: int64


s1 = pd.Series({"A": 10, "B": 20})
s2 = pd.Series({"C": 1, "D": 2})
print(s1)
print(s2)

new_s = pd.concat( [s1, s2] ) 
# concat keeps each original Series' index labels unchanged
# because of that, the result can be confusing for Series created from lists,
# whose default 0, 1, 2, ... labels would then appear more than once
new_s

A    10
B    20
dtype: int64
C    1
D    2
dtype: int64

A    10
B    20
C     1
D     2
dtype: int64


nums = Series([1, 9, 8, 2])
nums

0    1
1    9
2    8
3    2
dtype: int64


nums > 5

0    False
1     True
2     True
3    False
dtype: bool


nums = Series([7, 5, 8, 2, 3])
nums

0    7
1    5
2    8
3    2
4    3
dtype: int64


mod_2 = nums % 2
mod_2

0    1
1    1
2    0
3    0
4    1
dtype: int64


odd = mod_2 == 1
odd

0     True
1     True
2    False
3    False
4     True
dtype: bool


s1 = pd.Series({"A": 10, "B": 20})
s2 = pd.Series({"B": 1, "A": 2})
print(s1)
print(s2)

A    10
B    20
dtype: int64
B    1
A    2
dtype: int64


s1 + s2 # index alignment

A    12
B    21
dtype: int64


Series([100, 200, 300])

0    100
1    200
2    300
dtype: int64


# we can create our own index by passing argument to index param
nums1 = Series([100, 200, 300], index = [2, 1, 0]) 
nums1

2    100
1    200
0    300
dtype: int64


X = Series([100, 200, 300])
Y = Series([10, 20, 30])
Z = Series([10, 20, 30], index = [2,1,0])


print(X)
print(Y)
print(Z)

0    100
1    200
2    300
dtype: int64
0    10
1    20
2    30
dtype: int64
2    10
1    20
0    30
dtype: int64

X+Y

0    110
1    220
2    330
dtype: int64

X+Z

0    130
1    220
2    310
dtype: int64


letters = Series(["A", "B", "C", "D"])
letters

0    A
1    B
2    C
3    D
dtype: object


bool_series = Series([True, True, False, False])
bool_series

0     True
1     True
2    False
3    False
dtype: bool


# we can use the bool_series almost like an index
# to pull values out of letters:

letters[bool_series]

0    A
1    B
dtype: object


# We could also create the Boolean Series on the fly:
letters[Series([True, True, False, False])]

0    A
1    B
dtype: object


# Let's grab the last two letters:
letters[Series([False, False, True, True])]

2    C
3    D
dtype: object


# Let's grab the first and last (can't do this with a slice):
letters[Series([True, False, False, True])]

0    A
3    D
dtype: object


s = Series({"w": 6, "x": 7, "y": 8, "z": 9})
b = Series({"w": True, "x": False, "y": False, "z": True})
s[b]

w    6
z    9
dtype: int64


# we want to pull out 9 and 8
S = Series([1, 9, 2, 3, 8])
S

0    1
1    9
2    2
3    3
4    8
dtype: int64


B = S > 5
B

0    False
1     True
2    False
3    False
4     True
dtype: bool


# this will pull out values from S at index 1 and 4,
# because the values in B at index 1 and 4 are True
S[B]

1    9
4    8
dtype: int64


print(S)
S[S > 5]

0    1
1    9
2    2
3    3
4    8
dtype: int64

1    9
4    8
dtype: int64


words = Series(["APPLE", "boy", "CAT", "dog"])
words

0    APPLE
1      boy
2      CAT
3      dog
dtype: object


# we can use .str.upper() to get upper case version of words
upper_words = words.str.upper()
upper_words

0    APPLE
1      BOY
2      CAT
3      DOG
dtype: object


# B will be True where the original word equals the upper-case version
B = words == upper_words
B

0     True
1    False
2     True
3    False
dtype: bool


# pull out just the words that were originally uppercase
words[B]

0    APPLE
2      CAT
dtype: object


words[words == upper_words]

0    APPLE
2      CAT
dtype: object


words[words == words.str.upper()]

0    APPLE
2      CAT
dtype: object


nums = Series([11, 12, 19, 18, 15, 17])
nums

0    11
1    12
2    19
3    18
4    15
5    17
dtype: int64


nums[nums % 2 == 1]

0    11
2    19
4    15
5    17
dtype: int64


s = Series([5, 55, 11, 12, 999])
s

0      5
1     55
2     11
3     12
4    999
dtype: int64


s >= 10

0    False
1     True
2     True
3     True
4     True
dtype: bool


s <= 20

0     True
1    False
2     True
3     True
4    False
dtype: bool


# we have to use the symbols & and | to combine boolean Series;
# the keywords and / or don't work with pandas
(s >= 10) & (s <= 20)

0    False
1    False
2     True
3     True
4    False
dtype: bool


s[(s >= 10) & (s <= 20)]

2    11
3    12
dtype: int64


# & and | bind more tightly than comparison operators such as >= and <=,
# so each comparison needs its own parentheses.
# Without them, the expression below is read as s >= (10 | s) <= 20,
# a chained comparison that asks for the truth value of a whole Series
# and therefore raises ValueError

try:
    s[s >= 10 | s <= 20]
except ValueError as e:
    # print() converts the exception to its message text on its own
    print("Reason for crash:", e)

Reason for crash: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


print(s)
s[ (s < 12) | (s > 33)]

0      5
1     55
2     11
3     12
4    999
dtype: int64

0      5
1     55
2     11
4    999
dtype: int64


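# Nearly the same selection using & and ~ (NOT).  The exact De Morgan equivalent of
# (s < 12) | (s > 33) is ~((s >= 12) & (s <= 33)); with the strict > and < used here,
# 12 is also selected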
s[ ~((s > 12) & (s < 33))]

0      5
1     55
2     11
3     12
4    999
dtype: int64


# one Series per column, both using the default 0..3 index
name_column = Series(["Alice", "Bob", "Cindy", "Dan"])
score_column = Series([100, 150, 160, 120])

# the keyword names become the column labels of the DataFrame
table = DataFrame(dict(name=name_column, score=score_column))
table


# column label -> list of cell values, one entry per row
data = {
    "name": ["Alice", "Bob", "Cindy", "Dan"],
    "score": [100, 150, 160, 120],
}
df = DataFrame(data=data)
df


# we'll use the DataFrame of scores defined
# in the previous section
df


# let's grab the name column using DataFrame["COL NAME"]
df["name"]

0    Alice
1      Bob
2    Cindy
3      Dan
Name: name, dtype: object


# or we could extract the score column:
df["score"]

0    100
1    150
2    160
3    120
Name: score, dtype: int64


# if we want to generate some simple stats over a column,
# we can use .describe()
df["score"].describe()

count      4.000000
mean     132.500000
std       27.537853
min      100.000000
25%      115.000000
50%      135.000000
75%      152.500000
max      160.000000
Name: score, dtype: float64


# lookup is done for columns by default (df[x] looks up column named x)
# we can also lookup a row, but we need to use df.loc[y].  ("loc" stands for location)
# for example, let's get Bob's row:
df.loc[1]

name     Bob
score    150
Name: 1, dtype: object


# if we want a particular cell, we can use df.loc[row,col].
# for example, this is Bob's score:
df.loc[1, "score"]

150


# we can also use this to modify cells:
df.loc[1, "score"] += 5
df


# movies is a DataFrame
movies = pd.read_csv('IMDB-Movie-Data.csv')

# how many are there?
print("Number of movies:", len(movies))

Number of movies: 998


# it's large, but we can preview the first few with DataFrame.head()
movies.head()


# we can pull out Runtime minutes if we like
runtime = movies["Runtime"]

# it's still long (same length as movies), but let's preview the first 10 runtime minutes
runtime.head(10)

0    121
1    124
2    117
3    108
4    123
5    103
6    128
7     89
8    141
9    116
Name: Runtime, dtype: int64


# what is the mean runtime, in hours?
runtime.mean() / 60

1.8861723446893788


# what if we want stats about movies from 2016?
# use .head() on results to make it shorter
(movies["Year"] == 2016).head()

0    False
1    False
2     True
3     True
4     True
Name: Year, dtype: bool


# boolean indexing: keep only the rows whose Year column equals 2016
movies_2016 = movies[movies["Year"] == 2016]
print(f"there are {len(movies_2016)} movies in 2016")
# preview the first 10 of the selected rows
movies_2016.head(10)

there are 296 movies in 2016


# let's get some general stats about movies from 2016
movies_2016.describe()

	Index	Title	Genre	Director	Cast	Year	Runtime	Rating	Revenue
0	0	Guardians of the Galaxy	Action,Adventure,Sci-Fi	James Gunn	Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...	2014	121	8.1	333.13
1	1	Prometheus	Adventure,Mystery,Sci-Fi	Ridley Scott	Noomi Rapace, Logan Marshall-Green, Michael ...	2012	124	7.0	126.46M
2	2	Split	Horror,Thriller	M. Night Shyamalan	James McAvoy, Anya Taylor-Joy, Haley Lu Richar...	2016	117	7.3	138.12M
3	3	Sing	Animation,Comedy,Family	Christophe Lourdelet	Matthew McConaughey,Reese Witherspoon, Seth Ma...	2016	108	7.2	270.32
4	4	Suicide Squad	Action,Adventure,Fantasy	David Ayer	Will Smith, Jared Leto, Margot Robbie, Viola D...	2016	123	6.2	325.02

	Index	Title	Genre	Director	Cast	Year	Runtime	Rating	Revenue
2	2	Split	Horror,Thriller	M. Night Shyamalan	James McAvoy, Anya Taylor-Joy, Haley Lu Richar...	2016	117	7.3	138.12M
3	3	Sing	Animation,Comedy,Family	Christophe Lourdelet	Matthew McConaughey,Reese Witherspoon, Seth Ma...	2016	108	7.2	270.32
4	4	Suicide Squad	Action,Adventure,Fantasy	David Ayer	Will Smith, Jared Leto, Margot Robbie, Viola D...	2016	123	6.2	325.02
5	5	The Great Wall	Action,Adventure,Fantasy	Yimou Zhang	Matt Damon, Tian Jing, Willem Dafoe, Andy Lau	2016	103	6.1	45.13
6	6	La La Land	Comedy,Drama,Music	Damien Chazelle	Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....	2016	128	8.3	151.06M
7	7	Mindhorn	Comedy	Sean Foley	Essie Davis, Andrea Riseborough, Julian Barrat...	2016	89	6.4	0
8	8	The Lost City of Z	Action,Adventure,Biography	James Gray	Charlie Hunnam, Robert Pattinson, Sienna Mille...	2016	141	7.1	8.01
9	9	Passengers	Adventure,Drama,Romance	Morten Tyldum	Jennifer Lawrence, Chris Pratt, Michael Sheen,...	2016	116	7.0	100.01M
10	10	Fantastic Beasts and Where to Find Them	Adventure,Family,Fantasy	David Yates	Eddie Redmayne, Katherine Waterston, Alison Su...	2016	133	7.5	234.02
11	11	Hidden Figures	Biography,Drama,History	Theodore Melfi	Taraji P. Henson, Octavia Spencer, Janelle Mon...	2016	127	7.8	169.27M

	Index	Year	Runtime	Rating
count	296.000000	296.0	296.000000	296.000000
mean	374.986486	2016.0	107.337838	6.433446
std	299.342658	0.0	17.438533	1.023419
min	2.000000	2016.0	66.000000	2.700000
25%	105.750000	2016.0	94.000000	5.800000
50%	297.000000	2016.0	106.000000	6.500000
75%	615.250000	2016.0	118.000000	7.200000
max	997.000000	2016.0	163.000000	8.800000

Introduction to Pandas¶

Pandas Series¶

Careful with the Vocabulary!¶

Series vs. Dictionary¶

Series vs. List¶

Indexing and Slicing¶

Element-Wise Operations¶

Series insertion¶

Series concatenation¶

Boolean Element-Wise Operation¶

Data Alignment¶

Boolean Indexing¶

Combining Element-Wise Operations with Selection¶

Example 1: extract number > 5¶

Example 2: extract upper case letters¶

Example 3: extract odd numbers¶

Example 4: using `and` and/or `or`¶

Pandas DataFrame¶

Accessing DataFrame Values¶

Reading CSV Files¶

Conclusion¶