Python商業(yè)數(shù)據(jù)分析代碼_第1頁(yè)
Python商業(yè)數(shù)據(jù)分析代碼_第2頁(yè)
Python商業(yè)數(shù)據(jù)分析代碼_第3頁(yè)
Python商業(yè)數(shù)據(jù)分析代碼_第4頁(yè)
Python商業(yè)數(shù)據(jù)分析代碼_第5頁(yè)
已阅读5页，还剩55页未读，继续免费阅读

下載本文檔

版权说明：本文档由用户提供并上传，收益归属内容提供方，若内容存在侵权，请进行举报或认领

文檔簡(jiǎn)介

1. 商业数据分析及其工具

# Section 1 demo: load a CSV into pandas, inspect it, run a one-sample
# t-test and draw box plots.  Reconstructed from OCR-damaged text.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats as ss

# Read data from a local file.
# NOTE(review): the path was garbled in the source; drive letter and file
# name ('al2-1.csv') are reconstructed -- confirm before running.
df = pd.read_csv('F:/2glkx/data/al2-1.csv')

# Read data from the web.
data_url = (
    'https://raw.githubusercontent.com/alstat/Analysis-with-Programming/'
    'master/2014/Python/Numerical-Descriptions-of-the-Data/data.csv'
)
df = pd.read_csv(data_url)

print(df.head())
print(df.tail())

print(df.columns)
# Index(['Abra', 'Apayao', 'Benguet', 'Ifugao', 'Kalinga'], dtype='object')

# Extracting row names or the index
print(df.index)
# RangeIndex(start=0, stop=79, step=1)

# Transpose data
print(df.T)

# DataFrame.ix no longer exists in pandas; use iloc (positional) instead.
print(df.iloc[:, 0].head())
# .ix[10:20] sliced the default integer index by label (end inclusive),
# hence the positional stop of 21.
print(df.iloc[10:21, 0:3])

print(df.drop(df.columns[[2, 3]], axis=1).head())

print(df.describe())

# Perform one-sample t-test using 15000 as the true mean
print(ss.ttest_1samp(a=df.loc[:, 'Abra'], popmean=15000))
# Recorded output: pvalue=0.26270472069109496

# The same test applied to every column at once.
print(ss.ttest_1samp(a=df, popmean=15000))

# plt.show() takes no artist argument: draw first, then show.
df.plot(kind='box')
plt.show()

# Do the boxplot
sns.boxplot(data=df)
plt.show()

def add_2int(x, y):
    """Return the sum of *x* and *y*."""
    return x + y


print(add_2int(2, 2))

2 Python商业数据的存取

# Build a DataFrame from a dict of columns, write it to CSV, and read a
# CSV file back.  Reconstructed from OCR-damaged text.
import numpy as np
import pandas as pd

a = ['apple', 'pear', 'watch', 'money']
b = [[1, 2, 3, 4, 5], [5, 7, 8, 9, 0], [1, 3, 5, 7, 9], [2, 4, 6, 8, 0]]

# Pair each column name with its list of values.
d = dict(zip(a, b))
print(d)

p = pd.DataFrame(d)
print(p)

p.to_csv('F:\\2glkx\\data\\IBM.csv')

# NOTE(review): file name reconstructed from OCR ('al2-l.csv'); confirm.
pd.read_csv('F:\\2glkx\\data\\al2-1.csv')

# Read Excel workbooks into DataFrames and show their first rows.
import numpy as np
import pandas as pd

# NOTE(review): file names reconstructed from OCR ('al2-2.xls'); confirm.
df = pd.read_excel('F:\\2glkx\\data\\al2-2.xls')
print(df.head())

# Read the data into a table named `data`.  read_excel already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
data = pd.read_excel('G:\\2glkx\\data\\al2-2.xls')

# Show the first 5 rows of the table.
print(data.head())

# Download daily history with tushare and persist it as CSV, including an
# append-to-one-big-file loop over several stock codes.
import os

import numpy as np
import pandas as pd
import tushare as ts

df = ts.get_hist_data('000875')  # fetch the data from the web

# To save every column instead:
#     df.to_csv('F:/2glkx/data/000875.csv')
# Save only the selected columns.
df.to_csv('F:/2glkx/data/000875.csv',
          columns=['open', 'high', 'low', 'close'])

df = pd.read_csv('F:/2glkx/data/000875.csv')
print(df.head())

filename = 'F:/2glkx/data/bigfile.csv'
for code in ['000875', '600848', '000981']:
    df = ts.get_hist_data(code)
    if os.path.exists(filename):
        # The file already has a header row: append the rows only.
        df.to_csv(filename, mode='a', header=False)
    else:
        df.to_csv(filename)

df = pd.read_excel('F:/2glkx/data/000875.xls')
print(df.head())

# Collect the closing prices of several stocks into one DataFrame.
import numpy as np
import pandas as pd
import tushare as ts  # install first with: pip install tushare

data = pd.DataFrame()
for code in ('600000', '000980', '000981'):
    close = ts.get_hist_data(code)['close']
    # The source reverses each series with the comment "from the oldest
    # date to the most recent".
    data[code] = close[::-1]

data.info()  # inspect the data
data = data.dropna()
data.info()
print(data.head())
print(data.tail())

data = data[['600000', '000981']]
print(data.head())

# DataFrame.ix no longer exists in pandas; select rows by position.
# NOTE(review): assumes the index is not integer-valued, so that the
# original .ix[1:4] was positional -- confirm.
print(data.iloc[1:4])
print(data.iloc[:2, :3])

# Fetch daily bars for one stock through the tushare "pro" API.
import pandas as pd
import tushare as ts

pd.set_option('display.expand_frame_repr', False)  # show all columns
ts.set_token('your token')  # a token is required; register first to get one
pro = ts.pro_api()

stock_data = pro.daily(ts_code='000001.SZ',
                       start_date='20100101',
                       end_date='20190101')
print(stock_data.head())

# Download price history with pandas_datareader, plot it and save it.
# Only the `datetime` module is imported, so the module and the class of
# the same name never shadow each other.
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader.data as web

# NOTE(review): every call below uses the 'yahoo' source exactly as the
# original does; confirm it is still served by the installed version.

start = datetime.datetime(2017, 1, 1)  # start of the requested period
end = datetime.date.today()            # end of the requested period
# Prices of 600797.SS (浙大网新) from 2017-01-01 until today.
stock = web.DataReader('600797.SS', 'yahoo', start, end)
print(stock.head())  # first 5 rows of the DataFrame

# One year back from now.
end = datetime.datetime.now()
try:
    start = datetime.datetime(end.year - 1, end.month, end.day)
except ValueError:
    # Today is Feb 29 and last year had no such day.
    start = datetime.datetime(end.year - 1, end.month, 28)
df = web.DataReader('600797.SS', 'yahoo', start, end)
df['Adj Close'].plot(legend=True, figsize=(10, 4))
plt.show()

# NOTE(review): the source comment here names 600797.SS but the call
# fetches 600018.SS; the call is kept as written.
df_csvsave = web.DataReader('600018.SS', 'yahoo',
                            datetime.datetime(2019, 1, 1),
                            datetime.date.today())
print(df_csvsave)
df_csvsave.to_csv(r'F:\2glkx\data\600018.csv',
                  columns=df_csvsave.columns, index=True)

# Fetch daily bars for several stocks and stack them into one DataFrame.
import pandas as pd
import tushare as ts

pd.set_option('display.expand_frame_repr', False)  # show all columns
ts.set_token('your token')  # a token is required
pro = ts.pro_api()

code_list = ['000001.SZ', '600000.SH', '000002.SZ']

# DataFrame.append was removed in pandas 2.0: collect the pieces and
# concatenate them once.
frames = []
for code in code_list:
    print(code)
    frames.append(
        pro.daily(ts_code=code, start_date='20180101', end_date='20180104')
    )
stock_data = pd.concat(frames, ignore_index=True)
print(stock_data)

grouped = stock_data.groupby('ts_code')
print(grouped)

3 Python商业数据的图形绘制与可视化

# Histograms, a scatter plot and a bubble chart of the employee data set.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NOTE(review): file name reconstructed from OCR ('al3-1.xls'); confirm.
df = pd.read_excel('G:/2glkx/data/al3-1.xls')
# or: df = pd.read_excel('G:\\2glkx\\data\\al3-1.xls')
print(df.head())

# Plain histogram of Age with 7 bins.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(df['Age'], bins=7)
plt.show()

# The same histogram with a title.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(df['Age'], bins=7)
plt.title('Age distribution')
plt.show()

# The same histogram with a title and axis labels.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(df['Age'], bins=7)
plt.title('Age distribution')
plt.xlabel('Age')
plt.ylabel('#Employee')
plt.show()

# Scatter plot of Sales against Age.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['Age'], df['Sales'])
# You can also add more variables here to represent color and size.
plt.title('Age & Sales Scatter of Employee')
plt.xlabel('Age')
plt.ylabel('Sales')
plt.show()

# Bubble chart: a third variable, Income, sets the marker size.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['Age'], df['Sales'], s=df['Income'])
plt.xlabel('Age')
plt.ylabel('Sales')
plt.show()

# Box plots of the employee data.  `df` is the DataFrame loaded by the
# preceding snippet.
import matplotlib.pyplot as plt
import pandas as pd

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(df['Age'])
# NOTE(review): this call was garbled ('pit.BoxfigureofAge'); it is
# reconstructed as a title -- confirm.
plt.title('Box figure of Age')
plt.show()

# Box plots of several columns at once.  The list is not called `vars`,
# which would shadow the builtin.
var_names = ['Age', 'Sales']
data = df[var_names]
data.plot(kind='box')
plt.show()

# Pie chart of total Sales per Gender.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.read_excel('G:/2glkx/data/al3-1.xls')
print(df.head())

# Sum the numeric columns per gender.  stack() followed by unstack()
# round-trips the table and is kept only for the names `var` and `temp`.
var = df.groupby(['Gender']).sum(numeric_only=True).stack()
temp = var.unstack()
x_list = temp['Sales']
label_list = temp.index

plt.axis('equal')  # equal aspect ratio so the pie is a circle
# The original computed label_list but never passed it to pie().
plt.pie(x_list, labels=label_list)
# NOTE(review): title text reconstructed from OCR -- confirm.
plt.title('Pastafarianism expenses')
plt.show()

# Exploded pie chart with percentage labels.  pyplot is used explicitly
# instead of `from pylab import *`.
import matplotlib.pyplot as plt

# Make a square figure and axes.
plt.figure(1, figsize=(6, 6))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])

fracs = [60, 40]           # share of each wedge; the total is 100
explode = (0, 0.08)        # offset of each wedge from the centre
labels = 'Male', 'Female'  # label of each wedge

plt.pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=('g', 'r'))
plt.title('Rate of Male and Female')  # title
plt.show()

# Bar, stacked-bar and line charts of Sales aggregated by Gender and BMI.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# NOTE(review): file name reconstructed from OCR ('al3-1.xls'); confirm.
df = pd.read_excel('G:/2glkx/data/al3-1.xls')
print(df.head())

# Grouped sum of Sales at Gender level.
var = df.groupby('Gender').Sales.sum()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_xlabel('Gender')
ax1.set_ylabel('Sum of Sales')
ax1.set_title('Gender wise Sum of Sales')
var.plot(kind='bar', ax=ax1)  # draw on the axes labelled above

# Stacked bars: Sales by BMI, split by Gender.
var = df.groupby(['BMI', 'Gender']).Sales.sum()
var.unstack().plot(kind='bar', stacked=True, color=['red', 'blue'])

# Line chart: Sales by BMI.
var = df.groupby('BMI').Sales.sum()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_xlabel('BMI')
ax1.set_ylabel('Sum of Sales')
ax1.set_title('BMI wise Sum of Sales')
var.plot(kind='line', ax=ax1)

plt.show()

# Line and marker plots of yearly series read from Excel workbooks.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- al3-3.xls: columns 'year', 'total' and 'new' -----------------------
data = pd.read_excel('G:\\2glkx\\data\\al3-3.xls')
print(data.head())
t = np.array(data[['year']])
x = np.array(data[['total']])
y = np.array(data[['new']])

plt.plot(t, x)
plt.plot(t, y)
plt.show()

# The same plot with a title and axis labels.
plt.plot(t, x)
plt.plot(t, y)
plt.title('population census')
plt.xlabel('Time')
plt.ylabel('Population')
plt.show()

# --- al3-4.xls: columns 'year' and 'number' -----------------------------
data = pd.read_excel('G:\\2glkx\\data\\al3-4.xls')
print(data.head())
t = np.array(data[['year']])
x = np.array(data[['number']])

plt.plot(t, x)
plt.title('1998-2015 of A listed companies in china')
plt.xlabel('Time')
plt.ylabel('companies numbers')
plt.show()

# The same series drawn as red circle markers.
plt.plot(t, x, 'ro')
plt.title('1998-2015 of A listed companies in china')
plt.xlabel('Time')
plt.ylabel('companies numbers')
plt.show()

# 3-D bar chart: one row of 12 random monthly values per year.
import random

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker  # makes mpl.ticker available explicitly
import numpy as np
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (kept from source)

mpl.rcParams['font.size'] = 10

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

years = [2011, 2012, 2013, 2014]
xs = range(1, 13)  # months 1..12
for z in years:
    ys = 1000 * np.random.rand(12)
    color = plt.cm.Set2(random.choice(range(plt.cm.Set2.N)))
    # zdir='y' places each year's bars at y == z.
    ax.bar(xs, ys, zs=z, zdir='y', color=color, alpha=0.8)

ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xs))
# The y axis carries the years (see zdir above); the original put its
# ticks at the last random `ys` values instead.
ax.yaxis.set_major_locator(mpl.ticker.FixedLocator(years))

ax.set_xlabel('Month')
ax.set_ylabel('Year')
ax.set_zlabel('Sales Net [usd]')

plt.show()

# Triangulated 3-D surface over a polar grid of points.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (kept from source)
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np

n_angles = 36
n_radii = 8

# An array of radii.
# Does not include radius r=0; this is to eliminate duplicate points.
radii = np.linspace(0.125, 1.0, n_radii)

# An array of angles.
angles = np.linspace(0, 2 * np.pi, n_angles, endpoint=False)

# Repeat all angles for each radius.
angles = np.repeat(angles[..., np.newaxis], n_radii, axis=1)

# Convert polar (radii, angles) coords to cartesian (x, y) coords.
# (0, 0) is added here.  There are no duplicate points in the (x, y) plane.
x = np.append(0, (radii * np.cos(angles)).flatten())
y = np.append(0, (radii * np.sin(angles)).flatten())

# Pringle surface
z = np.sin(-x * y)

fig = plt.figure()
# fig.gca(projection=...) is not accepted by newer Matplotlib releases.
ax = fig.add_subplot(projection='3d')
ax.plot_trisurf(x, y, z, cmap=cm.jet, linewidth=0.2)

plt.show()

4 Python描述性统计

#兩個(gè)常用的統(tǒng)計(jì)包

# Mean, median and one mode of two small data sets.
# Two commonly used statistics packages.
import scipy.stats as stats
import numpy as np

# Two data sets used as examples; x2 is x1 plus one outlier.
x1 = [1, 2, 2, 3, 4, 5, 5, 7]
x2 = x1 + [100]

print('x1的平均值:', sum(x1), '/', len(x1), '=', np.mean(x1))
print('x2的平均值:', sum(x2), '/', len(x2), '=', np.mean(x2))
# x1的平均值: 29 / 8 = 3.625
# x2的平均值: 129 / 9 = 14.333333333333334

print('x1的中位数:', np.median(x1))
print('x2的中位数:', np.median(x2))

# stats.mode returns a scalar mode on current SciPy and a length-1 array
# on older releases; atleast_1d makes the lookup work on both.
print('One mode of x1:', np.atleast_1d(stats.mode(x1).mode)[0])

#因此我們自定義一個(gè)求眾數(shù)的函數(shù)

def mode(l):
    """Return every most-frequent element of *l*.

    Args:
        l: a sequence of hashable items.

    Returns:
        A list of all elements tied for the highest count (in order of
        first appearance) when some element repeats or *l* has exactly
        one element; otherwise the string 'No mode'.
    """
    from collections import Counter

    # Count how many times each element occurs.
    counts = Counter(l)
    if not counts:
        return 'No mode'

    # Return the elements with the highest count.
    max_count = max(counts.values())
    if max_count > 1 or len(l) == 1:
        return [key for key, value in counts.items() if value == max_count]
    return 'No mode'

# Show every mode of x1, using the mode() helper defined above.
print('All of the modes of x1:', mode(x1))

# Mode, geometric mean and harmonic mean for x1, x2 and a return series.
# NOTE(review): `D` is not defined in this file; presumably the BigQuant
# platform data API -- confirm.  x1 and x2 come from the earlier snippet.
import scipy.stats as stats
import numpy as np

# Fetch the price data and compute the mode of the returns.
start = '2014-01-01'
end = '2015-01-01'
pricing = D.history_data('000002.SZA', fields=['close'],
                         start_date=start, end_date=end)['close']
returns = pricing.pct_change()[1:]
print('收益率众数:', stats.mode(returns))

# All returns are distinct, so approximate the mode with a frequency
# distribution instead.
hist, bins = np.histogram(returns, 20)  # split the data into 20 bins
maxfreq = max(hist)
# The bin(s) holding the most data points stand in for the mode.
print('Mode of bins:',
      [(bins[i], bins[i + 1]) for i, j in enumerate(hist) if j == maxfreq])

# Geometric mean via scipy's gmean.
print('x1几何平均值:', stats.gmean(x1))
print('x2几何平均值:', stats.gmean(x2))

# Add 1 to every return before taking the geometric mean.
ratios = returns + 1
R_G = stats.gmean(ratios) - 1
print('收益率的几何平均值:', R_G)

T = len(returns)
# Positional access: `pricing` has T + 1 rows, whatever its index labels.
init_price = pricing.iloc[0]
final_price = pricing.iloc[T]
print('最初价格:', init_price)
print('最终价格:', final_price)
print('通过几何平均收益率计算的最终价格:', init_price * (1 + R_G) ** T)

print('x1的调和平均值:', stats.hmean(x1))
print('x2的调和平均值:', stats.hmean(x2))

# Dispersion measures of a random sample: range, mean absolute deviation,
# variance, standard deviation, a Chebyshev check and semivariances.
import numpy as np

np.random.seed(121)

# Generate 20 random integers below 100 and sort them.
X = np.sort(np.random.randint(100, size=20))
print('X: %s' % (X))

mu = np.mean(X)
print('X的平均值:', mu)

print('Range of X: %s' % (np.ptp(X)))

# Mean absolute deviation about the mean.
abs_dispersion = np.abs(X - mu)
MAD = np.sum(abs_dispersion) / len(abs_dispersion)
print('X的平均绝对偏差:', MAD)

print('X的方差:', np.var(X))
print('X的标准差:', np.std(X))

# Chebyshev check: the share of samples within k standard deviations of
# the mean, printed next to 1 - 1/k**2.
k = 1.25  # an arbitrary example value of k
dist = k * np.std(X)
within = [x for x in X if abs(x - mu) <= dist]
print('k值', k, '在k倍标准差距离内的样本为:', within)
print('验证', float(len(within)) / len(X), '>', 1 - 1 / k ** 2)

# There is no ready-made semivariance function, so compute it by hand:
# the mean squared deviation of the values at or below the mean.
lows = X[X <= mu]
semivar = np.sum((lows - mu) ** 2) / len(lows)
print('X的下偏方差:', semivar)
print('X的下偏标准差:', np.sqrt(semivar))

# Target semivariance: the same measure against a fixed target B.
B = 19  # the target is 19
lows_B = X[X <= B]
# No value at or below the target would mean dividing by zero; report NaN.
semivar_B = (np.sum((lows_B - B) ** 2) / len(lows_B)
             if len(lows_B) else float('nan'))
print('X的目标下偏方差:', semivar_B)
print('X的目标下偏标准差:', np.sqrt(semivar_B))

# Skewness: plot symmetric and skewed densities, then measure the skew of
# an index's returns.
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

xs = np.linspace(-6, 6, 300)
normal = stats.norm.pdf(xs)
plt.plot(xs, normal)

# Generate the x grid between the 1% and 99% quantiles.
xs2 = np.linspace(stats.lognorm.ppf(0.01, .7, loc=-.1),
                  stats.lognorm.ppf(0.99, .7, loc=-.1), 150)

# Skew > 0
lognormal = stats.lognorm.pdf(xs2, .7)
plt.plot(xs2, lognormal, label='Skew > 0')

# Skew < 0: the same density, mirrored.
plt.plot(xs2, lognormal[::-1], label='Skew < 0')
plt.legend()

# Note from the source: this part runs in the BigQuant environment.
# NOTE(review): `D` is not defined in this file; presumably that
# platform's data API -- confirm.
start = '2016-01-01'
end = '2018-01-01'
pricing = D.history_data('000300.SHA',
                         start_date=start, end_date=end)['close']
returns = pricing.pct_change()[1:]

print('Skew:', stats.skew(returns))
print('Mean:', np.mean(returns))
print('Median:', np.median(returns))
plt.hist(returns, 30)

# Skew of HS300 returns fetched with tushare, then a kurtosis comparison.
# `xs` and `normal` are defined by the preceding skewness snippet.
# The original also imported pandas.core.datetools, which no longer
# exists in pandas and was unused, so that import is dropped.
from scipy import stats
import statsmodels.api as sm  # statistics library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tushare as ts  # financial data API package

IndexData = ts.get_k_data(code='hs300', start='2016-01-01', end='2018-08-01')
IndexData.index = pd.to_datetime(IndexData.date)
close = IndexData.close

# Simple returns; the first row has no previous close and is dropped.
returns = (close - close.shift(1)) / close.shift(1)
returns = returns.dropna()

print('Skew:', stats.skew(returns))
print('Mean:', np.mean(returns))
print('Median:', np.median(returns))
plt.hist(returns, 30)

# Excess kurtosis of three reference distributions.
plt.plot(xs, stats.laplace.pdf(xs), label='Leptokurtic')
print('尖峰的超额峰度:', (stats.laplace.stats(moments='k')))
plt.plot(xs, normal, label='Mesokurtic (normal)')
print('正态分布超额峰度:', (stats.norm.stats(moments='k')))
plt.plot(xs, stats.cosine.pdf(xs), label='Platykurtic')
print('平峰超额峰度:', (stats.cosine.stats(moments='k')))
plt.legend()

# Jarque-Bera normality test on `returns` (defined by the preceding
# snippet).
from statsmodels.stats.stattools import jarque_bera

# jarque_bera returns (statistic, p-value, skew, kurtosis); only the
# p-value is needed here.
_, pvalue, _, _ = jarque_bera(returns)

if pvalue > 0.05:
    print('沪深300收益率数据服从正态分布.')
else:
    print('沪深300收益率数据并不服从正态分布.')

# Outlier handling for a cross-section of ROE values, three ways:
# fixed quantiles, mean +/- 3 standard deviations, and MAD.
# NOTE(review): `D` is not defined in this file; presumably the BigQuant
# platform data API -- confirm.
import numpy as np

fields = ['fs_roe_0']
start_date = '2017-04-21'
end_date = '2017-04-21'
instruments = D.instruments(start_date, end_date)
roe = D.features(instruments, start_date, end_date,
                 fields=fields)['fs_roe_0']

# 1. Descriptive statistics
print('均值:', roe.mean())
print('标准差:', roe.std())
print(roe.describe())

# 2. Histogram
roe.hist(bins=100)

# Method 1: fixed ratio -- cap at the 1% and 99% quantiles.
roe = D.features(instruments, start_date, end_date,
                 fields=fields)['fs_roe_0']
roe = roe.clip(lower=roe.quantile(0.01), upper=roe.quantile(0.99))
print('均值:', roe.mean())
print('标准差:', roe.std())
roe.hist(bins=100)

# Method 2: mean and standard deviation.  Values more than three
# standard deviations from the mean count as outliers and are reassigned
# to the boundary.
roe = D.features(instruments, start_date, end_date,
                 fields=fields)['fs_roe_0']
# Both bounds come from the untouched data; the original recomputed the
# lower bound after the upper cap had already shifted mean and std.
center, spread = roe.mean(), roe.std()
roe = roe.clip(lower=center - 3 * spread, upper=center + 3 * spread)
print('均值:', roe.mean())
print('标准差:', roe.std())
roe.hist(bins=100)

# Method 3: MAD.
roe = D.features(instruments, start_date, end_date,
                 fields=fields)['fs_roe_0']
roe = roe.dropna()
median = np.median(roe)
# Median of the absolute deviations from the median; the original
# computed mean(|roe| - median), which is not a deviation measure.
MAD = np.median(np.abs(roe - median))
# Drop values more than 6 MADs away from the median.
roe = roe[np.abs(roe - median) / MAD <= 6]
print('均值:', roe.mean())
print('标准差:', roe.std())
roe.hist(bins=100)

# Reload the ROE cross-section for the adjusted-boxplot method.
# NOTE(review): `D`, `instruments`, `start_date`, `end_date` and `fields`
# come from the preceding snippet; `D` is presumably the BigQuant data
# API -- confirm.
from statsmodels.stats.stattools import medcouple

roe = D.features(instruments, start_date, end_date,
                 fields=fields)['fs_roe_0']
roe = roe.dropna()

def boxplot(data):
    """Cap *data* at skewness-adjusted boxplot fences.

    The fences start from the quartiles and are stretched by exponential
    factors of the medcouple, so a skewed sample gets asymmetric limits.

    Args:
        data: a sequence of numbers.  It is not modified; the original
            sorted the caller's list in place.

    Returns:
        A pandas Series holding the sorted values, with everything below
        the lower fence or above the upper fence replaced by that fence.
    """
    values = np.sort(np.asarray(data, dtype=float))

    # mc is computed directly with statsmodels' medcouple function.
    mc = medcouple(values)

    q1 = values[int(0.25 * len(values))]
    q3 = values[int(0.75 * len(values))]
    iqr = q3 - q1

    # NOTE(review): the exponents (-3.5 / 4 and -4 / 3.5) are kept exactly
    # as in the source; confirm them against the intended reference.
    if mc >= 0:
        lower = q1 - 1.5 * np.exp(-3.5 * mc) * iqr
        upper = q3 + 1.5 * np.exp(4 * mc) * iqr
    else:
        lower = q1 - 1.5 * np.exp(-4 * mc) * iqr
        upper = q3 + 1.5 * np.exp(3.5 * mc) * iqr

    return pd.Series(values).clip(lower=lower, upper=upper)

# Apply the adjusted-boxplot capping once and reuse the result.
# `boxplot` and `roe` are defined by the preceding snippets.
adjusted = boxplot(list(roe))
print('均值', adjusted.mean())
print('标准差', adjusted.std())
adjusted.hist(bins=100)

15

5 Python参数估计

# Estimate a success probability from 0/1 outcomes and the matching odds.
import numpy as np

# NOTE(review): the literal was damaged by OCR ('l' read as 1 and 'LI'
# read as '1, 1'); confirm the values against the source data.
x = [1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
     0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1]

theta = np.mean(x)       # sample proportion of ones
h = theta / (1 - theta)  # odds: p / (1 - p)
print('h=', h)

5.3 Python单正态总体均值区间估计

# 95% confidence interval for a normal mean when the variance is known.
import numpy as np
import scipy.stats as ss

x = [14.6, 15.1, 14.9, 14.8, 15.2, 15.1]
n = len(x)            # sample size (6)
p = 0.025             # tail probability on each side
sigma = np.sqrt(0.6)  # known population standard deviation

xbar = np.mean(x)
# Half-width: z_{1-p} * sigma / sqrt(n)
half_width = ss.norm.ppf(q=1 - p) * (sigma / np.sqrt(n))
low = xbar - half_width
up = xbar + half_width
print('low=', low)
print('up=', up)

2. 方差 $\sigma^2$ 未知时 $\mu$ 的置信区间

# 95% confidence interval for a normal mean when the variance is unknown
# (Student t with n - 1 degrees of freedom).
import numpy as np
import scipy.stats as ss
from scipy.stats import t

x = [99.3, 98.7, 100.5, 101.2, 98.3, 99.7, 99.5, 102.1, 100.5]
n = len(x)          # sample size (9)
p = 0.025           # tail probability on each side
s = np.sqrt(1.47)   # sample standard deviation, as given in the source

xbar = np.mean(x)
# Half-width: t_{1-p, n-1} * s / sqrt(n)
half_width = ss.t.ppf(1 - p, n - 1) * (s / np.sqrt(n))
low = xbar - half_width
up = xbar + half_width
print('low=', low)
print('up=', up)

# The same t-based 95% interval in one call to stats.t.interval.
from scipy import stats
import numpy as np

x = [99.3, 98.7, 100.5, 101.2, 98.3, 99.7, 99.5, 102.1, 100.5]

# Arguments: confidence level, degrees of freedom, centre, scale.
interval = stats.t.interval(0.95, len(x) - 1, np.mean(x), stats.sem(x))
print(interval)
# (99.04599342616191, 100.90956212939363)

5.4 Python单正态总体方差区间估计

# 95% confidence interval for a normal variance (chi-square based).
from scipy.stats import chi2

n = 16        # sample size
sq = 0.0023   # sample variance
p = 0.025     # tail probability on each side

# (n-1) s^2 / chi2_{1-p}  <=  sigma^2  <=  (n-1) s^2 / chi2_{p}
low = ((n - 1) * sq) / chi2.ppf(1 - p, n - 1)
up = ((n - 1) * sq) / chi2.ppf(p, n - 1)
print('low=', low)
print('up=', up)

5.5 Python双正态总体均值差区间估计

# 95% confidence interval for the difference of two normal means when
# both variances are known.
import numpy as np
import scipy.stats as ss

x = [628, 583, 510, 554, 612, 523, 530, 615]
y = [535, 433, 398, 470, 567, 480, 498, 560, 503, 426]
n1 = len(x)
n2 = len(y)
xbar = np.mean(x)
ybar = np.mean(y)
sigmaq1 = 2140   # known variance of the first population
sigmaq2 = 3250   # known variance of the second population
p = 0.025        # tail probability on each side

# Half-width: z_{1-p} * sqrt(sigma1^2 / n1 + sigma2^2 / n2)
half_width = ss.norm.ppf(q=1 - p) * np.sqrt(sigmaq1 / n1 + sigmaq2 / n2)
low = xbar - ybar - half_width
up = xbar - ybar + half_width
print('low=', low)
print('up=', up)

2. 两方差都未知时两均值的置信区间

# 95% confidence interval for the difference of two normal means when
# the variances are unknown (pooled-variance t interval).
import numpy as np
import scipy.stats as ss

x = [628, 583, 510, 554, 612, 523, 530, 615]
y = [535, 433, 398, 470, 567, 480, 498, 560, 503, 426]
n1 = len(x)
n2 = len(y)

# The pooled formula weights each variance by (n - 1), so it needs the
# sample variances (ddof=1); np.var defaults to the population variance.
s1 = np.var(x, ddof=1)
s2 = np.var(y, ddof=1)
xbar = np.mean(x)
ybar = np.mean(y)
p = 0.025  # tail probability on each side

# Pooled variance estimate.
sq = ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 - 1 + n2 - 1)

# Half-width: t_{1-p, n1+n2-2} * sqrt(sq * (1/n1 + 1/n2))
half_width = ss.t.ppf(1 - p, n1 + n2 - 2) * np.sqrt(sq * (1 / n1 + 1 / n2))
low = xbar - ybar - half_width
up = xbar - ybar + half_width
print('low=', low)
print('up=', up)

5.6 Python双正态总体方差比区间估计

# 95% confidence interval for the ratio of two normal variances (F based).
import numpy as np
from scipy.stats import f

x = [20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9]
y = [20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2]

# The F interval is built on the sample variances (ddof=1); np.var
# defaults to the population variance (ddof=0).
sq1 = np.var(x, ddof=1)
sq2 = np.var(y, ddof=1)
n1 = len(x)
n2 = len(y)
p = 0.025  # tail probability on each side

# (s1^2 / s2^2) / F_{1-p}  <=  sigma1^2 / sigma2^2  <=  (s1^2 / s2^2) / F_{p}
low = sq1 / sq2 * 1 / f.ppf(1 - p, n1 - 1, n2 - 1)
up = sq1 / sq2 * 1 / f.ppf(p, n1 - 1, n2 - 1)
print('low=', low)
print('up=', up)
# The source recorded low=0.142168867371 for the ddof=0 version (its
# recorded `up` value was lost in extraction); with sample variances the
# bounds come out slightly different.

17

6 Python参数假设检验

6.2 Python单个样本t检验

# One-sample t-test of the 'sale' column against a mean of 500.
import pandas as pd
import numpy as np
from scipy import stats as ss

# Read the data into a table named `data`.
# NOTE(review): file name reconstructed from OCR ('al6-1.xls'); confirm.
data = pd.read_excel('G:\\2glkx\\data\\al6-1.xls')

# Show the first 5 rows of the table.
print(data.head())

# Take the sale data.
x = np.array(data[['sale']])
mu = np.mean(x)

# print() call instead of the Python 2 print statement.
print(mu, ss.ttest_1samp(a=x, popmean=500))

6.3 Python两个独立样本t检验

# Two independent samples t-test on columns 'fa' and 'fb'.
import numpy as np
import pandas as pd  # used below but not imported in the original snippet
from scipy.stats import ttest_ind

# Read the data into a table named `data`.
# NOTE(review): file name reconstructed from OCR ('al6-2.xls'); confirm.
data = pd.read_excel('G:\\2glkx\\data\\al6-2.xls')

x = np.array(data[['fa']])
y = np.array(data[['fb']])

t, p = ttest_ind(x, y)
# print() calls instead of Python 2 print statements.
print('t=', t)
print('p=', p)

6.4 Python配对样本t检验

# Paired-samples t-test on columns 'qian' and 'hou'.
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel

# Read the data into a table named `data`.
# NOTE(review): file name reconstructed from OCR ('al6-3.xls'); confirm.
data = pd.read_excel('G:\\2glkx\\data\\al6-3.xls')

x = np.array(data[['qian']])
y = np.array(data[['hou']])

t, p = ttest_rel(x, y)
# print() calls instead of Python 2 print statements.
print('t=', t)
print('p=', p)

6.5 Python单样本方差假设检验

# Chi-square statistic for a single-sample variance test of the 'syl'
# column against a hypothesised variance of 0.01.
import pandas as pd
import numpy as np

# Read the data into a table named `data`.
# NOTE(review): file name reconstructed from OCR ('al6-4.xls'); confirm.
data = pd.read_excel('G:\\2glkx\\data\\al6-4.xls')

# Show the first 5 rows of the table.
print(data.head())

# Take the return data.
x = np.array(data[['syl']])
n = len(x)

# Sample variance: the (n - 1) * s^2 / sigma0^2 statistic assumes ddof=1,
# while np.var defaults to ddof=0.
s2 = np.var(x, ddof=1)

# Chi-square value.
chisquare = (n - 1) * s2 / 0.01
print(chisquare)

# Chi-square goodness-of-fit test in Python.
from scipy import stats

obs = [102, 102, 96, 105, 95, 100]   # observed counts
exp = [100, 100, 100, 100, 100, 100]  # expected counts

# Keep the result instead of discarding it, then show it.
result = stats.chisquare(obs, f_exp=exp)
print(result)

6.6 Python双样本方差假设检验

# F test via an OLS fit and an ANOVA table for returnA against returnB.
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Per the source text: after the data file has been created under
# G:\2glkx\data, load it as follows.
# NOTE(review): file name reconstructed from OCR ('al6-5.xls'); confirm.
# Read the data into a table named `df`.
df = pd.read_excel('G:\\2glkx\\data\\al6-5.xls')

# Show the first 5 rows of the table.
print(df.head())

# Dependent variable on the left of '~', independent variable on the right.
# NOTE(review): the formula string was garbled in the source; confirm.
formula = 'returnA~returnB'
model = ols(formula, df).fit()  # build and fit the model from the formula
results = anova_lm(model)       # compute F and P
print(results)

7 Python相关分析

7.2 使用模拟数据计算变量之间的相关系数和绘图

#導(dǎo)入包

# Correlation of simulated data: two independent samples, then a sample
# and a noisy copy of it.
# Import packages
import numpy as np
import statsmodels.tsa.stattools as sts
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

# (1) Generate random variables and plot them.
X = np.random.randn(1000)
Y = np.random.randn(1000)
plt.scatter(X, Y)
plt.show()
# Print the value together with the caption; as a bare expression the
# coefficient is discarded when this runs as a script.
print('correlation of X and Y is', np.corrcoef(X, Y)[0, 1])

# Y is X plus normal noise with standard deviation 0.1.
X = np.random.randn(1000)
Y = X + np.random.normal(0, 0.1, 1000)
plt.scatter(X, Y)
plt.show()
print('correlation of X and Y is', np.corrcoef(X, Y)[0, 1])

7.3 使用本地数据计算变量之间的相关系数和绘图

# Pearson correlation between the 'adv' and 'sale' columns of a workbook.
import pandas as pd
import numpy as np
from scipy import stats  # the scipy.stats.stats alias is not public API

# Read the data into a table named `data`.
# NOTE(review): file name reconstructed from OCR ('al7-l.xls'); confirm.
data = pd.read_excel('G:\\2glkx\\data\\al7-1.xls')

# Show the first 5 rows of the table.
print(data.head())
# Recorded output (column spacing reconstructed from fused digits):
#    time  adv  sale
# 0     1   35    50
# 1     2   50   100
# 2     3   56   120
# 3     4   68   180
# 4     5   70   175

# Take the adv and sale data as 1-D arrays, as pearsonr expects (the
# original built (n, 1) arrays from double-bracket selection).
x = np.array(data['adv'])
y = np.array(data['sale'])

r = stats.pearsonr(x, y)[0]
print(r)

7.4 使用网上数据计算变量之间的相关系数和绘图

#本程序需在Bigquant平臺(tái)中運(yùn)行

#計(jì)算兩只股票的日收益率

#中國(guó)鐵建數(shù)據(jù)

Stockl=

D.history_data(["601186.SHA"],start_date=,2016-12-0T,end_dato=,2017-05-01,,fie

Ids=fclose*])Vclose1].pct_change()[1:]

#中國(guó)中鐵數(shù)據(jù)

Stock2=

D.history_data([z,60139D.SHA"],start_date=2016-12-01',end_date=,2017-05-01,,fie

Ids=fclose*])close*].pct_change()[1:]

pit.scatter(Stockl,Stock2)

pit.xlabel(z,601186.SHAdailyreturn*)

plI.ylabel(“601390.SHAdailyreturn*)

pit.show()

print(^thecorrlationfortwostocksis:")

Stock2.corr(

溫馨提示

  • 1. 本站所有资源如无特殊说明，都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD、CAXA、PROE、UG、SolidWorks等。压缩文件请下载最新的WinRAR软件解压。
  • 2. 本站的文檔不包含任何第三方提供的附件圖紙等,如果需要附件,請(qǐng)聯(lián)系上傳者。文件的所有權(quán)益歸上傳用戶所有。
  • 3. 本站RAR壓縮包中若帶圖紙,網(wǎng)頁(yè)內(nèi)容里面會(huì)有圖紙預(yù)覽,若沒(méi)有圖紙預(yù)覽就沒(méi)有圖紙。
  • 4. 未經(jīng)權(quán)益所有人同意不得將文件中的內(nèi)容挪作商業(yè)或盈利用途。
  • 5. 人人文庫(kù)網(wǎng)僅提供信息存儲(chǔ)空間,僅對(duì)用戶上傳內(nèi)容的表現(xiàn)方式做保護(hù)處理,對(duì)用戶上傳分享的文檔內(nèi)容本身不做任何修改或編輯,并不能對(duì)任何下載內(nèi)容負(fù)責(zé)。
  • 6. 下載文件中如有侵權(quán)或不適當(dāng)內(nèi)容,請(qǐng)與我們聯(lián)系,我們立即糾正。
  • 7. 本站不保證下載資源的準(zhǔn)確性、安全性和完整性, 同時(shí)也不承擔(dān)用戶因使用這些下載資源對(duì)自己和他人造成任何形式的傷害或損失。

最新文檔

評(píng)論

0/150

提交評(píng)論