numpy
常用方法
import numpy as np
import matplotlib.pyplot as plt
array = np.array([[1,2,3],[4,5,6]])
arr = np.array([1,2,'q'])
img_array = plt.imread('./7.jpg')
plt.imshow(img_array)
img_arraymg = img_array - 100
plt.imshow(img_arraymg)
np.ones(shape=(3,4))
np.linspace(0,100,20)
np.arange(10,50,step=2)
np.random.randint(0,100,size=(5,4))
常用属性
# Build a 5x4 array of random integers in [0, 100) and inspect its attributes.
arr = np.random.randint(0, 100, size=(5, 4))
arr.shape   # length of each axis: (5, 4)
arr.ndim    # number of axes: 2
arr.size    # total number of elements: 20
arr.dtype   # element type of the array
type(arr)   # the container type, numpy.ndarray

# The element type can be chosen explicitly when the array is created.
arr = np.array([1, 2, 3], dtype='int32')
arr.dtype
# Convert the values with astype(). Assigning to arr.dtype does not convert
# anything: it reinterprets the underlying bytes, so three int32 values would
# turn into twelve uint8 values.
arr = arr.astype('uint8')
arr.dtype
索引&切片
arr = np.random.randint(0,100,size=(5,4))
arr[0]
arr[0:2]
arr[:,0:2]
arr[0:2,0:2]
arr[::-1]
arr[:,::-1]
arr[::-1,::-1]
img_array.shape
plt.imshow(img_array[:,::-1,:])
plt.imshow(img_array[200:400,1000:1500,:])
聚合&矩阵
# arr is the 5x4 random array created in the slicing section; reshape must
# keep the total element count (20) unchanged.
arr.reshape((20,))     # flatten to one dimension
arr.reshape((4, 5))    # same data as 4 rows x 5 columns

# np.concatenate joins arrays along an existing axis.
np.concatenate((arr, arr), axis=0)   # stack the rows:   result is (10, 4)
np.concatenate((arr, arr), axis=1)   # extend the columns: result is (5, 8)

# Stack three copies of the picture along axis 0. img_array is the image
# loaded with plt.imread near the top of the file; no other image variable
# is defined at this point.
img_2 = np.concatenate((img_array, img_array, img_array), axis=0)
plt.imshow(img_2)

# axis=1 aggregates across the columns, giving one result per row.
arr.sum(axis=1)
arr.mean(axis=1)
- 常用数学函数
- 三角函数:np.sin()、np.cos()、np.tan()
- 舍入函数:np.around(a, decimals)
a: 数组 decimals:舍入的小数位数,默认为0.如果为负,整数将四舍五入到小数点左侧位置
np.sin(arr)
np.around(3.8)
- 常用统计函数
- numpy.amin() 和 numpy.amax() :用于计算数组中的元素沿指定轴的最小值和最大值
- numpy.ptp(): 最大值和最小值的差
- numpy.median(): 计算数组a中元素的中位数
- 标准差 std() :标准差是一组数据平均值分散程度的一种度量
- 公式:std = sqrt(mean((x-x.mean())**2))
- 方差 var() :统计中的方差是每个样本值与平均数之差的平方的平均数,即 var = mean((x-x.mean())**2);标准差就是方差的平方根
arr[1].std()
arr[1].var()
a1 = np.array([[2,1],[4,3]])
a2 = np.array([[1,2],[1,0]])
np.dot(a1,a2)
array([[3, 4],
[7, 8]])
pandas
Series
Series
是一种类似于一维数组的对象,由两部分组成
values
:一维数组
index
:相关的数据索引标签
Series
的创建
from pandas import Series
# A Series built from a plain list gets the default integer index 0..n-1.
s = Series(data=[1, 2, 3, 'four'])
import numpy as np
# A one-dimensional NumPy array works as the data source as well.
Series(data=np.random.randint(0, 100, size=(3,), dtype='int64'))
# An explicit index attaches a label to every value.
s = Series(data=[1, 2, 3, 'four'], index=['a', 'b', 'c', 'd'])
s
s.dtype   # object, because the values mix integers and a string

# Building from a dict uses the keys as the index labels.
dic = {
    '语文': 100,
    '数学': 55,
    '理综': 333,
}
s = Series(data=dic)
s
# Positional access goes through .iloc: s[0] on a label-indexed Series relies
# on a deprecated fallback from label lookup to positional lookup.
s.iloc[0]
s.语文          # label access written as an attribute
s.iloc[0:2]    # positional slice: the first two entries
s.shape
s.size
s.index
s.values
s.dtype
# Ten random integers in [60, 100) to try the inspection helpers on.
s = Series(data=np.random.randint(60, 100, size=(10,)))
s.head(3)     # first three entries
s.tail(3)     # last three entries
s.isnull()    # boolean mask, True where a value is missing
s.notnull()   # inverse mask; without the parentheses the method is not called

# Arithmetic aligns on index labels: a label present in only one operand
# ('b' and 'd' here) yields NaN in the result.
s1 = Series(data=[1, 2, 3], index=['a', 'b', 'c'])
s2 = Series(data=[1, 2, 3], index=['a', 'd', 'c'])
s = s1 + s2
s
s.isnull()
DataFrame
DataFrame
是一个表格型的数据结构,由按一定顺序排列的多列数据组成.既有行索引,也有列索引
- 行索引:
index
- 列索引:
columns
- 值:
values
DataFrame
的创建
from pandas import DataFrame
df = DataFrame(data=[[1,2,3],[2,3,4]])
df
df = DataFrame(data=np.random.randint(10,99,size=(6,4)))
df
dic = {
'name':['zhangsna','lisi','wangwu'],
'salary':[8000,1000,2000]
}
df = DataFrame(data=dic)
df
df.values
df.columns
df.index
df.shape
df.dtypes
dic = {
'张三':[150,150,150,150],
'李四':[0,0,0,0]
}
df = DataFrame(data=dic,index=['语文','数学','英语','理综'])
df
索引&切片
from pandas import DataFrame
df = DataFrame(data=[[1,2,3],[2,3,4]])
df
df = DataFrame(data=np.random.randint(10,99,size=(6,4)))
df
dic = {
'name':['zhangsna','lisi','wangwu'],
'salary':[8000,1000,2000]
}
df = DataFrame(data=dic)
df
df.values
df.columns
df.index
df.shape
df.dtypes
dic = {
'张三':[150,150,150,150],
'李四':[0,0,0,0]
}
df = DataFrame(data=dic,index=['语文','数学','英语','理综'])
df
df = DataFrame(data=np.random.randint(10,99,size=(8,4)),columns=['a','b','c','d'])
df
df['a']
df.iloc[0]
df[['a','b']]
df.iloc[0]
df.iloc[[0,3]]
df.iloc[0,3]
df.loc[0,'c']
df.iloc[[1,2,3],2]
df[0:2]
df.iloc[:,0:2]
运算
时间数据类型转换
dic = {
'time':['2020-10-01','2021-11-11'],
'temp':[20,30]
}
df = DataFrame(data=dic)
df['time']
import pandas as pd
df['time'] = pd.to_datetime(df['time'])
df
df['time']
df.set_index('time',inplace=True)
df
案例演示
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import tushare as ts
df =ts.get_k_data(code='600519',start='2000-01-01')
df.to_csv('./maotai.csv')
df = pd.read_csv('./maotai.csv')
df.head()
df.drop(labels='Unnamed: 0',axis=1,inplace=True)
df.head()
df.info()
df['date'] = pd.to_datetime(df['date'])
df.info()
df.set_index('date',inplace=True)
df.head()
df.loc[(df['close']-df['open'])/df['open']>0.06].index
(df['open']-df['close'].shift(1))/df['close'].shift(1)<-0.02
df.loc[(df['open']-df['close'].shift(1))/df['close'].shift(1)<-0.02].index
# Restrict the data to the window 2010-01 .. 2020-02 (label-based row slice
# on the DatetimeIndex set above).
new_df = df.loc['2010-01':'2020-02']

# First row of every calendar month; cost is 100 shares at each of those
# opening prices.
# NOTE(review): pandas >= 2.2 spells the 'M' / 'A' aliases 'ME' / 'YE' -
# confirm against the installed version before changing them.
df_monthly = new_df.resample('M').first()
cost = df_monthly['open'].sum() * 100
cost

# Last row of every calendar year; [:-1] drops the final year (2020), which
# only contains two months. resv is 1200 shares at each of those opening prices.
df_yearly = new_df.resample('A').last()[:-1]
resv = df_yearly['open'].sum() * 1200
resv

# 200 shares valued at the most recent closing price. .iloc[-1] is an explicit
# positional lookup; plain [-1] on a date-indexed Series is a deprecated
# fallback.
last_month = 200 * new_df['close'].iloc[-1]
last_month

# Overall result: proceeds plus remaining holdings minus total cost.
resv + last_month - cost
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import tushare as ts
df = pd.read_csv('./maotai.csv')
df
df.drop(labels='Unnamed: 0',axis=1,inplace=True)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date',inplace=True)
ma5 = df['close'].rolling(5).mean()
ma30 = df['close'].rolling(30).mean()
ma30
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(ma5[50:180])
plt.plot(ma30[50:180])
# Drop the first 30 rows, where the 30-day rolling mean is still warming up,
# from all three objects so that they stay aligned.
ma5 = ma5[30:]
ma30 = ma30[30:]
df = df[30:]

s1 = ma5 < ma30   # short average below the long average
s2 = ma5 > ma30   # short average above the long average
df

# Death cross: below today while it was above on the previous row.
death_ex = s1 & s2.shift(1)
death_date = df.loc[death_ex].index

# Golden cross: not below today and not above on the previous row.
golden_ex = ~(s1 | s2.shift(1))
golden_date = df.loc[golden_ex].index
golden_date

# Encode the signals as one Series: 1 marks a golden cross, 0 a death cross.
s1 = Series(data=1, index=golden_date)
s2 = Series(data=0, index=death_date)
# Series.append was removed in pandas 2.0; pd.concat is its replacement.
s = pd.concat([s1, s2])
s = s.sort_index()
s = s['2010':'2020']
s
# Replay the signal Series s (1 = golden cross, 0 = death cross) against the
# opening prices in df.
first_money = 100000   # starting capital
money = first_money    # cash currently available
hold = 0               # number of shares currently held

for trade_date, signal in s.items():
    price = df.loc[trade_date, 'open']
    if signal == 1:
        # Buy as many whole lots of 100 shares as the cash allows.
        hand_count = money // (price * 100)
        bought = hand_count * 100
        # Accumulate rather than overwrite, so that two consecutive buy
        # signals cannot discard shares that are already held.
        hold += bought
        money -= bought * price
    else:
        # Sell everything that is held.
        money += price * hold
        hold = 0

# Value whatever is still held at the most recent closing price
# (.iloc[-1] is an explicit positional lookup on the date-indexed column).
last_monry = hold * df['close'].iloc[-1]
# Total profit: cash plus remaining holdings minus the starting capital.
money + last_monry - first_money
基于pandas的数据清洗
处理丢失数据(空值)
- 在数据分析中需要使用浮点型的空而不是对象类型的空
- 在pandas中遇到None形式的空值则pandas会将其强转成NAN的形式
import numpy as np
type(None)
type(np.nan)
import pandas as pd
from pandas import DataFrame,Series
# Use a float frame so that missing values can be stored without changing a
# column's dtype (recent pandas versions warn when NaN is assigned into an
# integer column).
df = DataFrame(data=np.random.randint(0, 100, size=(8, 6)).astype(float))
# Both None and np.nan end up as NaN in a numeric column.
df.iloc[2, 3] = None
df.iloc[4, 4] = np.nan
df.iloc[5, 5] = None
df

# Approach 1: locate the rows that contain a missing value and drop them.
df.isnull().any(axis=1)                 # True for rows with at least one NaN
df.loc[df.isnull().any(axis=1)]
drop_index = df.loc[df.isnull().any(axis=1)].index
df.drop(labels=drop_index, axis=0)

# Approach 2: keep only the rows in which every value is present.
df.notnull().all(axis=1)
df.loc[df.notnull().all(axis=1)]

# Approach 3: dropna does the same in a single call.
df.dropna(axis=0)

# Fill along each row: take the value to the left first, then the value to
# the right for anything still missing. ffill()/bfill() replace the
# deprecated fillna(method=...) spelling.
df.ffill(axis=1).bfill(axis=1)
处理重复数据和异常数据
# --- duplicated rows -------------------------------------------------------
# Random 8x4 frame in which rows 2, 4 and 6 are overwritten with zeros so
# that they become exact duplicates of one another.
df = DataFrame(data=np.random.randint(0, 100, size=(8, 4)))
for dup_row in (2, 4, 6):
    df.iloc[dup_row] = [0, 0, 0, 0]
df
# Keep the first occurrence of every duplicated row and drop the others.
df.drop_duplicates(keep='first')

# --- outliers --------------------------------------------------------------
# 1000x3 frame of uniform random floats in [0, 1).
df = DataFrame(
    data=np.random.random(size=(1000, 3)),
    columns=['A', 'B', 'C'],
)
df
# A value in column C counts as an outlier when it exceeds twice the
# column's standard deviation.
twice_std = df['C'].std() * 2
is_outlier = df['C'] > twice_std
is_outlier
# Keep the rows that are not outliers.
df.loc[~is_outlier]
DataFrame的级联和合并操作
import numpy as np
import pandas as pd
from pandas import DataFrame
# Two frames that share columns A, B and C; only df2 has column D.
df1 = DataFrame(data=np.random.randint(0, 100, size=(5, 3)), columns=['A', 'B', 'C'])
df2 = DataFrame(data=np.random.randint(0, 100, size=(5, 4)), columns=['A', 'B', 'C', 'D'])
df1
df2
# Default (outer) join keeps the union of the columns; D is NaN for df1's rows.
pd.concat((df1, df2), axis=0)
# Inner join keeps only the columns that both frames share.
pd.concat((df1, df2), axis=0, join='inner')
# DataFrame.append was removed in pandas 2.0; pd.concat with its default
# arguments produces the same row-wise result.
pd.concat([df1, df2])
- 合并(数据合并)
- merge与concat的区别在于,merge需要依据某一共同列来合并
- 使用pd.merge()合并时,会自动根据两者相同column名称的那一列,作为key来进行合并
- 注意每一列元素的顺序不要求一致
df1 = DataFrame({'employee':['Bob','jack','lisa'],
'group':['Accounting','Engineering','Engineering'],})
df1
df2 = DataFrame({'employee':['lisa','jack','Bob'],
'hire_date':[2018,2020,2021],})
df2
pd.merge(df2,df1,on='employee' )
pandas高级操作
替换操作
df = DataFrame(data=np.random.randint(0,100,size=(5,6)))
df
df.replace(to_replace=2,value='Two')
df.replace(to_replace={74:'one'})
df.replace(to_replace={3:50},value='five ten')
映射操作
- 创建一个映射关系表,把
values
中的元素和一个特定的标签或者字符串绑定(给一个元素提供不同的表现形式)
dic = {
'name':['张三','李四'],
'salary':[1000,2000]
}
df = DataFrame(data=dic)
df
dic = {
'张三':'ssm',
'李四':'shaoshao'
}
df['e_name'] = df['name'].map(dic)
df
def salary_after(s):
    """Return the salary left after deducting 50% of the part above 500.

    Args:
        s: Gross salary (a number).

    Returns:
        ``s - (s - 500) * 0.5``.
    """
    # NOTE(review): for s below 500 this formula returns more than s;
    # confirm whether small salaries should be left unchanged instead.
    return s - (s - 500) * 0.5
df['after_sal'] = df['salary'].map(salary_after)
df
排序实现的随机抽样
df =DataFrame(data=np.random.randint(0,100,size=(100,3)),columns=['A','B','C'])
df
df.take([2,0,1],axis=1)
df.take(np.random.permutation(3),axis=1).take(np.random.permutation(100),axis=0)[0:10]
分组聚合
df = DataFrame({
'item':['apple','banan'],
'price':[3,2],
'color':['red','yellow'],
'weight':[12,34]
})
df
df.groupby(by='item').groups
df.groupby(by='item')['price'].mean()
dic = df.groupby(by='color')['weight'].mean().to_dict()
df['mean_w'] = df['color'].map(dic)
- transform apply 也可传入一个
lambda
表达式
def my_mean(s):
    """Return the arithmetic mean of the values in *s*.

    Args:
        s: A sized iterable of numbers, e.g. the values of one group.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If *s* is empty.
    """
    return sum(s) / len(s)
df.groupby(by='item')['price'].transform(my_mean)
df.groupby(by='item')['price'].apply(my_mean)
透视表
- 透视表是一种对数据动态排布并且分类汇总的表格格式
pivot_table
四个重要参数index,values,columns,aggfunc
index
:分类汇总的分类条件
values
:对计算的数据进行筛选
aggfunc
:设置对数据聚合时进行的函数操作,默认aggfunc = 'mean'
columns
:设置列层次字段
fill_value
: None->0
df.pivot_table(index=['对手','主客场'],values=['得分','计分板'],aggfunc='sum',columns='对手',fill_value=0)
交叉表
- 计算分组的特殊透视图,对数据进行汇总
- `pd.crosstab(index, columns)`
index
:分组数据,交叉表的行索引
columns
:交叉表的列索引
pd.crosstab(df.smoke,df.sex)
2012美国大选献金数据分析
import pandas as pd
df = pd.read_csv('./data/2012_Federal_Election_Commission_Database.csv')
df
df.info()
df.describe()
df.fillna(value='NOT PROVIDE',inplace=True)
df.info()
df['contb_receipt_amt'] <= 0
df.loc[df['contb_receipt_amt']<=0]
drop_index = df.loc[df['contb_receipt_amt']<=0].index
df.drop(labels=drop_index,axis=0,inplace=True)
df.loc[df['contb_receipt_amt']<=0]
print(1)
df.loc[df['contb_receipt_amt']<=0]
df['contb_receipt_amt'] <= 0
df.loc[df['contb_receipt_amt'] <=0 ]
drop_index = df.loc[df['contb_receipt_amt'] <=0 ].index
df.drop(labels=drop_index,axis=0)
df.describe()
df.fillna(value='NOT PROVIDE',inplace=True)
df.info()
df['contb_receipt_amt'] <= 0
df.loc[df['contb_receipt_amt'] <=0 ]
drop_index = df.loc[df['contb_receipt_amt'] <=0 ].index
df.drop(labels=drop_index,axis=0)
print(1)
df.loc[df['contb_receipt_amt'] <=0 ]
parties = {'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
df['party'] =df['cand_nm'].map(parties)
df.head()
df['party'].unique()
df['party'].value_counts()
df.groupby(by='party')['contb_receipt_amt'].sum()
df.groupby(by=['contb_receipt_dt','party'])['contb_receipt_amt'].sum()
# Upper-case three-letter month abbreviation -> month number.
months = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
}


def transformDate(d):
    """Convert a 'DD-MON-YY' date string into 'YYYY-M-DD'.

    Args:
        d: Date text with three '-' separated fields, e.g. '20-JUN-11'.

    Returns:
        The date as '20' + year, the month number (not zero-padded) and
        the day, joined with '-', e.g. '2011-6-20'.

    Raises:
        ValueError: If *d* does not split into exactly three fields.
        KeyError: If the month abbreviation is not a key of ``months``.
    """
    day, month, year = d.split('-')
    return f"20{year}-{months[month]}-{day}"
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(transformDate)
df.head()
df['contbr_occupation'] == 'DISABLED VETERAN'
df_old = df.loc[df['contbr_occupation'] == 'DISABLED VETERAN']
df_old.groupby(by='cand_nm')['contb_receipt_amt'].sum()
matplotlib绘图
线形图
import matplotlib.pyplot as plt
import numpy as np
x = np.array([1,2,3,4,5])
y = x + 3
plt.plot(x,y)
plt.plot(x+1,y-2)
plt.plot(x,y,x+3,y-1)
plt.figure(figsize=(5,9))
plt.plot(x,y)
plt.plot(x,y,label='x,y')
plt.plot(x+1,y-2,label='x+1,y-2')
plt.legend()
plt.plot(x,y)
plt.xlabel('temp')
plt.ylabel('dist')
plt.title('dist and temp')
fig = plt.figure()
plt.plot(x,y,label='x,y')
plt.legend()
fig.savefig('123.png')
plt.plot(x,y,c='red',alpha=0.5)
柱状图
plt.bar(x,y)
plt.barh(x,y)
直方图
data = [0,1,1,2,2,2,2,3,3,4,5,6,6,6,7,7,9,8]
plt.hist(data,bins=20)
饼图
arr = [1,2,3]
plt.pie(arr,labels=['a','b','c'],labeldistance=0.3,shadow=True,explode=[0.2,0.3,0.5],autopct='%0.6f%%')
散点图
x = np.array([1,3,5,6,7])
y = x**2+2
plt.scatter(x,y)