刚刚接触pandas的朋友,想了解数据结构,就一定要认识DataFrame,接下来给大家详细介绍!
初识DataFrame
import numpy as np
import pandas as pd

data = {'name': ['Jack', 'Tom', 'LiSa'],
        'age': [20, 21, 18],
        'city': ['BeiJing', 'TianJin', 'ShenZhen']}
print(data)
print()
frame = pd.DataFrame(data)  # 创建DataFrame
print(frame)
print()
print(frame.index)  # 查看行索引
print()
print(frame.columns)  # 查看列索引
print()
print(frame.values)  # 查看值
{'name': ['Jack', 'Tom', 'LiSa'], 'age': [20, 21, 18], 'city': ['BeiJing', 'TianJin', 'ShenZhen']}

   age      city  name
0   20   BeiJing  Jack
1   21   TianJin   Tom
2   18  ShenZhen  LiSa

RangeIndex(start=0, stop=3, step=1)

Index(['age', 'city', 'name'], dtype='object')

[[20 'BeiJing' 'Jack']
 [21 'TianJin' 'Tom']
 [18 'ShenZhen' 'LiSa']]
创建DataFrame
方法一:由字典创建。字典的key是列索引,值可以是:1. 列表;2. ndarray;3. Series
# 值是列表
data1 = {'a': [1, 2, 3],
         'b': [4, 5, 6],
         'c': [7, 8, 9]
         }
print(data1)
print()
print(pd.DataFrame(data1))  # 创建DataFrame
print()
# 注意:index可以给行索引重新命名;columns可以给列索引重新指定顺序,如果没有该列,那么产生NaN值
print(pd.DataFrame(data1, index=list('mnp'), columns=list('bcad')))
print()
{'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}

   a  b  c
0  1  4  7
1  2  5  8
2  3  6  9

   b  c  a    d
m  4  7  1  NaN
n  5  8  2  NaN
p  6  9  3  NaN
# 值是ndarray。注意:用ndarray创建DataFrame时,各数组中值的个数必须相同,否则报错
data2 = {'one': np.random.rand(3),
         'two': np.random.rand(3)
         }
print(data2)
print()
print(pd.DataFrame(data2))
{one:array([0.,0.,0.]),two:array([0.,0.,0.])}
onetwo
00..
10..
20..
# 值是Series——带有标签的一维数组。注意:用Series创建DataFrame时,值的个数可以不同,少的值用NaN填充
data3 = {'one': pd.Series(np.random.rand(4)),
         'two': pd.Series(np.random.rand(5))
         }
print(data3)
print()
df3 = pd.DataFrame(data3)
print(df3)
print()
{one:00.
10.
20.
30.
dtype:float64,two:00.
10.
20.
30.
40.
dtype:float64}
onetwo
00.0.
10.0.
20.0.
30.0.
4NaN0.
方法二:通过二维数组直接创建
arr = np.random.rand(12).reshape(3, 4)
print(arr)
print()
df1 = pd.DataFrame(arr)
print(df1)
print()
df2 = pd.DataFrame(arr, index=list('abc'), columns=['one', 'two', 'three', 'four'])  # 通过index和columns指定行索引和列索引
print(df2)
[[0....]
[0.569183...]
[0....]]
00....
10....
20....
onetwothreefour
a0....
b0....
c0....
方法三:由字典组成的列表创建DataFrame
data = [{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 15}]  # 每一个字典在DataFrame里就是一行数据
print(data)
print()
df1 = pd.DataFrame(data)
print(df1)
print()
df2 = pd.DataFrame(data, index=list('ab'), columns=['one', 'two', 'three', 'four'])
print(df2)
[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 15}]

   one  three  two
0    1    NaN    2
1    5   15.0   10

   one  two  three  four
a    1    2    NaN   NaN
b    5   10   15.0   NaN
创建方法四:由字典组成的字典
# columns为字典的key,index为子字典的key
data = {'Jack': {'age': 1, 'country': 'China', 'sex': 'man'},
        'LiSa': {'age': 18, 'country': 'America', 'sex': 'women'},
        'Tom': {'age': 20, 'country': 'English'}}
df1 = pd.DataFrame(data)
print(df1)
print()
# 注意:这里的index并不能给子字典的key(行索引)重新命名,但可以给子字典的key重新排序;若出现原数据没有的index,那么就填充NaN值
df2 = pd.DataFrame(data, index=['sex', 'age', 'country'])
print(df2)
print()
df3 = pd.DataFrame(data, index=list('abc'))
print(df3)
print()
# columns给列索引重新排序,若出现原数据没有的列索引,则填充NaN值
df4 = pd.DataFrame(data, columns=['Tom', 'LiSa', 'Jack', 'TangMu'])
print(df4)
          Jack     LiSa      Tom
age          1       18       20
country  China  America  English
sex        man    women      NaN

          Jack     LiSa      Tom
sex        man    women      NaN
age          1       18       20
country  China  America  English

  Jack LiSa  Tom
a  NaN  NaN  NaN
b  NaN  NaN  NaN
c  NaN  NaN  NaN

             Tom     LiSa   Jack TangMu
age           20       18      1    NaN
country  English  America  China    NaN
sex          NaN    women    man    NaN
DataFrame索引
选择行与列
选择列:直接用df[列标签]
df = pd.DataFrame(np.random.rand(12).reshape(3, 4) * 100,
                  index=['one', 'two', 'three'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
print(df['a'], '', type(df['a']))  # 取一列,返回Series
print()
print(df[['a', 'c']], '', type(df[['a', 'c']]))  # 取多列,返回DataFrame
abcd
one92....
two91....
three3....
one92.
two91.
three3.
Name:a,dtype:float64classpandas.core.series.Series
ac
one92.19.
two91.4.
three3.14.classpandas.core.frame.DataFrame
选择行:不能通过标签索引df['one']来选择行,要用df.loc['one'];loc就是针对行来操作的
print(df)
print()
print(df.loc['one'], '', type(df.loc['one']))  # 取一行,返回Series
print()
print(df.loc[['one', 'three']], '', type(df.loc[['one', 'three']]))  # 取不连续的多行,返回DataFrame
print()
abcd
one92....
two91....
three3....
a92.
b11.
c19.
d77.
Name:one,dtype:float64classpandas.core.series.Series
abcd
one92....
three3....classpandas.core.frame.DataFrame
loc支持切片索引——针对行,并且包含末端,例如df.loc['one':'three']
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100, index=['one', 'two', 'three', 'four'],
                  columns=['a', 'b', 'c', 'd'])
print(df)
print()
print(df.loc['one':'three'])
print()
print(df[:3])  # 切片表示取连续的多行(尽量不用,免得混淆)
abcd
one65.89419...
two31....
three54....
four45....
abcd
one65.89419...
two31....
three54....
abcd
one65.89419...
two31....
three54....
iloc也是对行来操作的,只不过把行标签改成了行索引(整数位置),并且切片是不包含末端的
print(df)
print()
print(df.iloc[0])  # 取一行
print()
print(df.iloc[[0, 2]])  # 取不连续的多行
print()
print(df.iloc[0:3])  # 不包含末端
abcd
one65.89419...
two31....
three54....
four45....
a65.894
b19.
c31.
d41.
Name:one,dtype:float64
abcd
one65.89419...
three54....
abcd
one65.89419...
two31....
three54....
布尔型索引
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100, index=['one', 'two', 'three', 'four'],
                  columns=['a', 'b', 'c', 'd'])
print(df)
print()
d1 = df > 50  # d1为布尔型索引
print(d1)
print()
print(df[d1])  # df根据d1只返回True的值,False的值对应为NaN
print()
abcd
one91....
two49....
three78....
four79....
abcd
oneTrueTrueTrueTrue
twoFalseFalseFalseTrue
threeTrueTrueFalseTrue
fourTrueTrueFalseFalse
abcd
one91....
twoNaNNaNNaN69.
three78..NaN93.
four79..NaNNaN
选取某一列作为布尔型索引,返回True所在行的所有列。注意:按行筛选时不能选取多列作为布尔型索引
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100, index=['one', 'two', 'three', 'four'],
                  columns=['a', 'b', 'c', 'd'], dtype=np.int64)
print(df)
print()
d2 = df['b'] > 50
print(d2)
print()
print(df[d2])
abcd
one
two
three
four
oneFalse
twoFalse
threeTrue
fourFalse
Name:b,dtype:bool
abcd
three
选取多列作为布尔型索引,返回True所对应的值,False对应为NaN,没有选取的列全部填充为NaN
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100, index=['one', 'two', 'three', 'four'],
                  columns=['a', 'b', 'c', 'd'], dtype=np.int64)
print(df)
print()
d3 = df[['a', 'c']] > 50
print(d3)
print()
print(df[d3])
abcd
one
two
three
four91677
ac
oneFalseFalse
twoTrueFalse
threeFalseTrue
fourFalseFalse
abcd
oneNaNNaNNaNNaN
two78.0NaNNaNNaN
threeNaNNaN84.0NaN
fourNaNNaNNaNNaN
多重索引
print(df)
abcd
one
two
three
four91677
print(df['a'].loc[['one', 'three']])  # 取列再取行
print()
print(df[['a', 'c']].iloc[0:3])
one49
three6
Name:a,dtype:int64
ac
one
two
three
print(df.loc[['one', 'three']][['a', 'c']])  # 取行再取列
ac
one
three
print(df > 50)
print()
print(df[df > 50])
print()
print(df[df > 50][['a', 'b']])
abcd
oneFalseTrueFalseFalse
twoTrueFalseFalseTrue
threeFalseTrueTrueTrue
fourFalseTrueFalseTrue
abcd
oneNaN82.0NaNNaN
two78.0NaNNaN84.0
threeNaN84...0
fourNaN89.0NaN77.0
ab
oneNaN82.0
two78.0NaN
threeNaN84.0
fourNaN89.0
DataFrame基本技巧
import numpy as np
import pandas as pd

arr = np.random.rand(16).reshape(8, 2) * 10
# print(arr)
print()
print(len(arr))
print()
df = pd.DataFrame(arr, index=[chr(i) for i in range(97, 97 + len(arr))], columns=['one', 'two'])
print(df)
8
onetwo
a2..
b8.6320.
c6.262.
d6..
e6..
f2..
g6..
h9..
查看数据
print(df)
print()
print(df.head(2))#查看头部数据默认查看5条
print()
print(df.tail(3))#查看末尾数据默认查看5条
onetwo
a2..
b8.6320.
c6.262.
d6..
e6..
f2..
g6..
h9..
onetwo
a2..
b8.6320.
onetwo
f2..
g6..
h9..
转置
print(df)
onetwo
a2..
b8.6320.
c6.262.
d6..
e6..
f2..
g6..
h9..
print(df.T)
abcdefg\
one2..6326.....
two1.8270.3.9.3.6.7.
h
one9.
two3.
添加与修改
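df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=['one', 'two', 'three', 'four'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
df.loc['five'] = 100  # 增加一行(原文的赋值在提取时丢失,此处以100为例)
print(df)
print()
df['e'] = 10  # 增加一列
print(df)
print()
df['e'] = 20  # 修改一列(原文的赋值在提取时丢失,此处以20为例)
print(df)
print()
df.loc['five'] = 200  # 修改一行(原文的赋值在提取时丢失,此处以200为例)
print(df)
print()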
abcd
one0.7810...
two0....
three0....
four0..5490..
abcd
one0.7810...
two0....
three0....
four0..5490..
five....
abcde
one0.7810...99007
two0....10
three0....10
four0..5490..10
five....00000
abcde
one0.7810...990071
two0....
three0....
four0..5490..
five....000001
abcde
one0.7810...990071
two0....
three0....
four0..5490..
five....
删除:del(删除列)/ drop(默认删除行,指定axis=1则删除列)
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=['one', 'two', 'three', 'four'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
del df['a']  # 删除列,会改变原DataFrame
print(df)
abcd
one0....
two0..1670..
three0....
four0....
bcd
one0...
two0.1670..
three0...
four0...
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=['one', 'two', 'three', 'four'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
d1 = df.drop('one')  # 删除行并返回新的DataFrame,不改变原DataFrame
print(d1)
print()
print(df)
abcd
one0....
two0.4260...
three0....
four0....
abcd
two0.4260...
three0....
four0....
abcd
one0....
two0.4260...
three0....
four0....
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=['one', 'two', 'three', 'four'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
d2 = df.drop('a', axis=1)  # 删除列并返回新的DataFrame,不会改变原DataFrame
print(d2)
print()
print(df)
abcd
one0..6130..
two0....
three0...1760.
four0....
bcd
one0.6130..
two0...
three0..1760.
four0...
abcd
one0..6130..
two0....
three0...1760.
four0....
排序
根据指定列的列值排序,同时列值所在的行也会跟着移动:.sort_values([列])
# 单列
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), columns=['a', 'b', 'c', 'd'])
print(df)
print()
print(df.sort_values(['a']))  # 默认升序
print()
print(df.sort_values(['a'], ascending=False))  # 降序
abcd
00....
10....
20....
30...9910.
abcd
10....
30...9910.
00....
20....
abcd
20....
00....
30...9910.
10....
根据索引排序:.sort_index()
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=[2, 1, 3, 0], columns=['a', 'b', 'c', 'd'])
print(df)
print()
print(df.sort_index())  # 默认升序
print()
print(df.sort_index(ascending=False))  # 降序
abcd
20.6110...
10....
30..3540..
00...1990.
abcd
00...1990.
10....
20.6110...
30..3540..
abcd
30..3540..
20.6110...
10....
00...1990.
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=['x', 'z', 'y', 't'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
print(df.sort_index())  # 根据字母表顺序排序
abcd
x0....
z0...2.
y0....
t0....
abcd
t0....
x0....
y0....
z0...2.
df = pd.DataFrame(np.random.rand(16).reshape(4, 4), index=['three', 'one', 'four', 'two'], columns=['a', 'b', 'c', 'd'])
print(df)
print()
print(df.sort_index())  # 按字符串的字典序排序(从单词首字母开始逐个字符比较)
abcd
three0....
one0...9160.
four0....
two0....
abcd
four0....
one0...9160.
three0....
two0....