Filter out troughs based on distance between peaks
我有以下数据框:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | date Values 3/1/2018 3/3/2018 0 3/5/2018 -0.011630952 3/8/2018 0.024635792 3/10/2018 3/10/2018 0.013662755 3/13/2018 2.563770771 3/15/2018 0.026081264 3/17/2018 3/25/2018 4.890818119 3/26/2018 3/28/2018 0.994944572 3/30/2018 0.098569691 4/2/2018 4/2/2018 2.261398315 4/4/2018 2.595984459 4/7/2018 2.145072699 4/9/2018 2.401818037 4/11/2018 4/12/2018 2.233839989 4/14/2018 2.179880142 4/17/2018 0.173141539 4/18/2018 4/19/2018 0.04037559 4/22/2018 2.813424349 4/24/2018 2.764060259 4/27/2018 5/2/2018 4.12789917 5/4/2018 4.282546997 5/4/2018 5/7/2018 5.083333015 5/13/2018 5/14/2018 1.615991831 5/17/2018 0.250209153 5/19/2018 5.003758907 5/20/2018 5/22/2018 5/24/2018 0.177665412 5/29/2018 6/1/2018 3.190019131 6/3/2018 3.514900446 6/5/2018 2.796386003 6/6/2018 4.132686615 6/8/2018 6/11/2018 2.82530117 6/14/2018 6/16/2018 1.786619782 6/18/2018 6/21/2018 1.60535562 6/21/2018 1.737388611 6/23/2018 0.048161745 6/26/2018 1.811254263 6/28/2018 0.109187543 6/30/2018 7/1/2018 0.086753845 7/3/2018 2.141263962 7/6/2018 1.116563678 7/7/2018 1.159829378 7/8/2018 0.107431769 7/11/2018 -0.001963556 7/13/2018 7/16/2018 7/16/2018 0.071490705 7/18/2018 1.052834034 7/21/2018 7/23/2018 7/23/2018 1.201774001 7/28/2018 0.218167484 7/31/2018 0.504413128 8/1/2018 8/2/2018 8/5/2018 1.057194233 8/7/2018 0.85014987 8/8/2018 1.183927178 8/10/2018 1.226516366 8/12/2018 1.533656836 8/15/2018 8/17/2018 8/17/2018 1.355006456 8/20/2018 1.490438223 8/22/2018 8/24/2018 1.160542369 8/25/2018 1.546550632 8/27/2018 8/30/2018 |
看起来是这样的:
。
如果两个峰之间的距离小于 14 天,我想过滤掉这两个峰之间的所有波谷。例如,我想过滤掉 5/7/2018 和 5/19/2018 这两个峰之间的低值。
基于@asmus的建议,我希望在最终结果中有一个峰值,因此高斯分布可能是最好的(重点是可能)。
重要提示:因为这个答案已经很长了,所以我决定把它完全重写,而不是第五次更新它。如果您对"历史背景"感兴趣,请查看版本历史记录。
首先,运行一些必需的导入:
1 2 3 4 5 6 7 8 9 10 | import pandas as pd import numpy as np import matplotlib.pyplot as plt from matplotlib import gridspec import matplotlib as mpl mpl.style.use('seaborn-paper') ## for nicer looking plots only from lmfit import fit_report from lmfit.models import GaussianModel, BreitWignerModel |
号
然后清理数据(即上面给出的数据,另存为 .csv 文件):
# Load the whitespace-separated table shown above (columns 'date' and 'Values')
# and reduce it to one valid observation per calendar date.
# sep=r'\s+' replaces delim_whitespace=True, which pandas has deprecated;
# both split fields on runs of whitespace.
df = pd.read_csv('pdFilterDates.txt', sep=r'\s+')
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

# Drop rows that carry no measurement (e.g. the first 3/10/2018 entry).
df = df.dropna()

# 6/21/2018 appears twice with different values; averaging the duplicates
# makes every date unique so it can be used as an index.
df = df.groupby('date').mean().reset_index()

# Index by date, but keep 'date' as a regular column as well (drop=False).
df = df.set_index('date', drop=False)
并按每日频率重新索引:
# Expand the frame to one row per calendar day between the first and the last
# date; days without an observation become NaN rows.
complete_date_range_idx = pd.date_range(
    df.index.min(),
    df.index.max(),
    freq='D',
)
df_filled = df.reindex(complete_date_range_idx, fill_value=np.nan)
df_filled = df_filled.reset_index()

# After reset_index the integer row labels count the days since the first
# date. They are kept in `idx` because the final fit is evaluated on them.
idx = df_filled.index.values

# (x, y): day offsets and values of the rows that actually hold a measurement.
not_na = df_filled['Values'].notna()
x = idx[not_na]
y = df_filled.loc[not_na, 'Values'].values

# From here on `df` refers to the daily-resolution frame.
df = df_filled
。
现在进入有趣的部分:用一种不对称的线型(Breit-Wigner-Fano)拟合数据,并删除低于某个阈值的"离群点"。我们首先手动指定峰的大致位置(即初始猜测,这一步可以去掉 3 个点),然后以第一次拟合(fit 1)作为输入再做一次(去掉 8 个点),最后得到最终的拟合结果。
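根据要求,我们现在可以把最终拟合插值到之前创建的每日索引上(bwf_result_final.eval(x=idx)),并在数据框中新增几列:y_fine 只保存拟合曲线,y_final 保存剔除离群点之后的最终点集,y_joined 则是二者合并后的数据(最终点集缺失的日期用拟合值填补)。完整代码如下: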
# Iteratively fit an asymmetric Breit-Wigner-Fano line shape to (x, y) and
# discard the points that lie too far below the curve:
#   pass 1 - drop points more than GUESS_MARGIN below the hand-made guess,
#   pass 2 - drop points more than FIT_MARGIN below the fit of what is left,
#   final  - refit the surviving points and evaluate that fit on every day.
# Uses x, y, idx and df as they exist at this point of the script.

GUESS_MARGIN = 1.9  # allowed distance below the initial guess curve
FIT_MARGIN = 0.8    # allowed distance below the first fit

# choose an asymmetric line shape (Fano resonance)
bwf_model = BreitWignerModel()

# hand-made initial guess for the peak
params = bwf_model.make_params(center=75, amplitude=0.2, sigma=20, q=1/0.2)

# fit on the full data set; only its init_fit (the guess curve) is used below
bwf_result = bwf_model.fit(y, params, x=x)

####------------------ create first figure ----------
fig = plt.figure(figsize=(8, 3), frameon=True)
gs1 = gridspec.GridSpec(1, 3,
                        left=0.08, right=0.95,
                        bottom=0.15, top=0.9,
                        wspace=0.1)
a1 = plt.subplot(gs1[0])
a2 = plt.subplot(gs1[1])
a3 = plt.subplot(gs1[2])

#------------------ first subplot ------------------
a1.set_title('Outliers from 1st guess')

## show initial x,y
a1.scatter(x, y, facecolors='None', edgecolors='b', marker='o',
           linewidth=1, zorder=3)

# Only points BELOW the guess are flagged. To exclude points on both sides use
# np.abs(y - bwf_result.init_fit) > GUESS_MARGIN instead.
outliers = np.argwhere((bwf_result.init_fit - y) > GUESS_MARGIN)

# remove outliers from point cloud
x_new = np.delete(x, outliers)
y_new = np.delete(y, outliers)

#### run a fit on the "cleaned" dataset
bwf_result_mod = bwf_model.fit(y_new, params, x=x_new)

a1.plot(x, bwf_result.init_fit, 'r--', label='initial guess')
a1.fill_between(x, bwf_result.init_fit, bwf_result.init_fit - GUESS_MARGIN,
                color='r', hatch='///', alpha=0.2, zorder=1,
                label=u'guess - {0}'.format(GUESS_MARGIN))
a1.scatter(x[outliers], y[outliers], c='r', marker='x', s=10**2,
           linewidth=1, zorder=4, label='outliers')  ## show outliers
a1.plot(x_new, bwf_result_mod.best_fit, color='g', label='fit 1')

pointsRemoved = len(y) - len(y_new)
a1.text(1.05, 0.5, u'↓{0} points removed'.format(pointsRemoved),
        ha='center', va='center', rotation=90, transform=a1.transAxes)

#------------------ second plot ------------------
a2.set_title('Outliers from 1st fit')

## show initial x,y
a2.scatter(x, y, facecolors='None', edgecolors='grey', marker='o',
           linewidth=.5, zorder=0, label='original data')
a2.scatter(x_new, y_new, facecolors='None', edgecolors='b', marker='o',
           linewidth=1, zorder=3)
a2.plot(x_new, bwf_result_mod.best_fit, color='g', label='fit 1')

# NOTE(review): assumes lmfit's residual is (best_fit - data), so a positive
# residual means the point lies below the fit, matching the shaded band drawn
# below. To exclude points on both sides use
# np.abs(bwf_result_mod.residual) > FIT_MARGIN instead.
new_outliers = np.argwhere(bwf_result_mod.residual > FIT_MARGIN)
x_new_2 = np.delete(x_new, new_outliers)
y_new_2 = np.delete(y_new, new_outliers)

a2.scatter(x_new[new_outliers], y_new[new_outliers], c='r', marker='x',
           s=10**2, linewidth=1, zorder=4, label='new outliers')
a2.fill_between(x_new, bwf_result_mod.best_fit,
                bwf_result_mod.best_fit - FIT_MARGIN,
                color='r', hatch='///', alpha=0.2, zorder=1,
                label=u'fit - {0}'.format(FIT_MARGIN))

pointsRemoved = len(y_new) - len(y_new_2)
a2.text(1.05, 0.5, u'↓{0} points removed'.format(pointsRemoved),
        ha='center', va='center', rotation=90, transform=a2.transAxes)

#------------------ third plot ------------------
_orig = len(y)
_remo = len(y) - len(y_new_2)
_pct = _remo / (_orig / 100.)
# Removed count first, total second, so the title reads "<removed> of <total>".
a3.set_title(u'Result ({0} of {1} removed, ~{2:.0f}%)'.format(_remo, _orig, _pct))

# The final point cloud is exactly what survived the second pass.
x_final = x_new_2
y_final = y_new_2

## store final point cloud in the df (x_final holds integer row labels of df)
df.loc[x_final, 'y_final'] = y_final

a3.scatter(x_final, y_final, facecolors='None', edgecolors='b', marker='o',
           linewidth=1, zorder=3)

## make final fit:
bwf_result_final = bwf_model.fit(y_final, params, x=x_final)

a3.scatter(x, y, facecolors='None', edgecolors='grey', marker='o',
           linewidth=.5, zorder=0, label='original data')
a3.plot(x_final, bwf_result_final.best_fit, color='g', label='fit 2')

## now that we are "happy" with bwf_result_final, apply it on the df's "fine"
## (i.e. daily) index
y_fine = bwf_result_final.eval(x=idx)
df['y_fine'] = y_fine  # fit function evaluated on every day

# Final point cloud where available, fit value everywhere else. fillna avoids
# the chained assignment df['y_joined'][mask] = ..., which pandas does not
# guarantee to write back into df.
df['y_joined'] = df['y_final'].fillna(df['y_fine'])

####------------------ create second figure ----------
fig2 = plt.figure(figsize=(8, 3), frameon=True)
gs2 = gridspec.GridSpec(1, 1,
                        left=0.08, right=0.95,
                        bottom=0.15, top=0.9,
                        wspace=0.1)
ax2 = plt.subplot(gs2[0])
ax2.scatter(df['date'], df['Values'], facecolors='None', edgecolors='grey',
            marker='o', linewidth=1, zorder=0, label='original data')
ax2.plot(df['index'], df['y_fine'], c="g", zorder=3,
         label="final fit applied to all dates")
ax2.plot(df['index'], df['y_joined'], color="r", marker=".", markersize=6,
         zorder=2, label="(points-outliers) +fit")

for a in [a1, a2, a3, ax2]:
    a.set_ylim(-.5, 7)
    a.legend()

a1.set_ylabel('Value')
ax2.set_ylabel('Value')

# a2 and a3 sit to the right of a1 and reuse its y tick labels
for a in [a2, a3]:
    plt.setp(a.get_yticklabels(), visible=False)

# The first figure is plotted against day offsets, the second against dates.
for a in [a1, a2, a3]:
    a.set_xlabel('Days from start')
ax2.set_xlabel('Date')

fig.savefig('outlier_removal.pdf')
fig2.savefig('final_data.pdf')

plt.show()
试试这个:
from scipy.signal import find_peaks

# first find the peaks
# interpolate is important for find_peaks to work
values = df.set_index('date').interpolate().reset_index().Values
# rel_height is not passed: find_peaks only uses it when `width` is given,
# so it had no effect on the detected peaks.
peaks = find_peaks(values)[0]

# copy the peaks' dates for easy manipulation
peak_df = df.loc[peaks, ['date']].copy()

# True where a peak lies at most 14 days after the previous peak
markers = (peak_df.date - peak_df.date.shift()).le(pd.Timedelta('14d'))

# Build the keep-mask and select the labels where it is True. The mask is
# indexed by the peak rows only, so it is turned into labels for df.loc rather
# than being used to index df directly.
# NOTE(review): `markers` is a plain boolean Series (the NaT gap of the first
# peak compares as False), so notnull() appears to be True everywhere and this
# mask seems to keep every peak - confirm the intended filter.
keep = markers.notnull() | ~markers.bfill().eq(False)
markers = keep[keep].index
df.loc[markers]