How to load an Excel sheet and clean the data in Python?
从文件 Energy Indicators.xls 中加载能源数据。该文件是联合国 2013 年能源供应和可再生电力生产指标的列表,应将其读入一个 DataFrame,并使用变量名 energy。
接下来,从文件 world_bank.csv 中加载 GDP 数据。该文件是一个 CSV 文件,其中包含世界银行 1960 年至 2015 年各国的 GDP。将这个 DataFrame 命名为 GDP。请确保跳过文件开头的说明行(标题行之前的内容),并按如下方式重命名国家/地区:"Korea, Rep." → "South Korea","Iran, Islamic Rep." → "Iran","Hong Kong SAR, China" → "Hong Kong"。
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


def _normalise_year_label(label):
    """Return ``'2006'`` for year-like labels such as ``2006.0``; pass others through."""
    try:
        number = float(label)
    except (TypeError, ValueError):
        return label
    return str(int(number)) if number.is_integer() else label


# --- Energy indicators -------------------------------------------------------
Energy = pd.read_excel('Energy Indicators.xls')
Energy.drop(Energy.columns[[0, 1]], axis=1, inplace=True)
Energy.columns = ['Country', 'Energy Supply', 'Energy Supply per capita', '% Renewable']
# Coerce to numbers first: multiplying an object column would *repeat* string
# cells (e.g. '...') a million times instead of scaling them.
Energy['Energy Supply'] = pd.to_numeric(Energy['Energy Supply'], errors='coerce') * 1000000
# regex=True is required on pandas >= 2.0, where str.replace is literal by default.
Energy['Country'] = Energy['Country'].str.replace(r"\(.*\)", "", regex=True)
Energy['Country'] = Energy['Country'].str.replace(r"[0-9()]+$", "", regex=True)
# Removing "(...)" leaves a trailing space that would break the merge on 'Country'.
Energy['Country'] = Energy['Country'].str.strip()
Energy.replace(
    {
        'Republic of Korea': 'South Korea',
        'United States of America': 'United States',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'China, Hong Kong Special Administrative Region': 'Hong Kong',
    },
    inplace=True,
)

# --- World Bank GDP ----------------------------------------------------------
GDP = pd.read_csv('world_bank.csv', index_col=0, header=None)
GDP = GDP.drop(['Data Source'])
GDP = GDP.dropna()
GDP = GDP.reset_index()
# The first remaining row holds the column names; promote it to the header.
GDP.columns = GDP.iloc[0]
# NOTE(review): row 0 is the header row promoted above; row 3 is dropped by
# position as well and the reason is not visible here - confirm it is intended.
GDP.drop(GDP.index[[0, 3]], inplace=True)
GDP = GDP.rename(columns={'Country Name': 'Country'})
# Only cells whose entire value is ',' are affected by this replacement.
GDP.replace(',', '-', inplace=True)
GDP = GDP.replace('Korea, Rep.', 'South Korea')
GDP = GDP.replace('Iran, Islamic Rep.', 'Iran')
GDP = GDP.replace('Hong Kong SAR, China', 'Hong Kong')

# --- Journal ranking ---------------------------------------------------------
ScimEn = pd.read_excel('scimagojr-3.xlsx')

# --- Merge, keep ranks 1-15, select the final columns --------------------------
b = pd.merge(pd.merge(Energy, GDP, on='Country'), ScimEn, on='Country')
a = b.sort_values('Rank', ascending=True)
a = a[a["Rank"] < 16]
# Rename year labels by value (2006.0 -> '2006') rather than by writing into
# ``a.columns.values`` at fixed positions.
a = a.rename(columns=_normalise_year_label)
a = a[['Country', 'Rank', 'Documents', 'Citable documents', 'Citations',
       'Self-citations', 'Citations per document', 'H index',
       'Energy Supply', 'Energy Supply per capita', '% Renewable',
       '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015']]
a = a.set_index('Country')


def ans():
    """Return the merged table of rank < 16 rows, indexed by country."""
    return a


ans()
import numpy as np
import pandas as pd


def energy():
    """Load and clean the 'Energy' sheet of ``Energy Indicators.xls``.

    Returns:
        DataFrame with columns 'Country', 'Energy Supply',
        'Energy Supply per Capita' and '% Renewable', re-indexed from 0.
    """
    with pd.ExcelFile('Energy Indicators.xls') as workbook:
        sheet = workbook.parse('Energy')
    # Rows 16..242 and these four columns are selected by position/label;
    # presumably they are the per-country rows - verify against the workbook.
    table = sheet.iloc[16:243][
        ['Environmental Indicators: Energy', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']
    ].copy()
    table.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
    table = table.replace('...', np.nan)
    table['Energy Supply'] = table['Energy Supply'] * 1000000

    # Clean the names *before* renaming: drop "(...)" qualifiers and trailing
    # footnote digits, then trim. Extracting only leading ASCII letters would
    # truncate names at the first comma, apostrophe or accented character.
    names = table['Country'].str.replace(r'\s*\(.*\)', '', regex=True)
    names = names.str.replace(r'\d+$', '', regex=True).str.strip()
    table['Country'] = names.replace({
        'Republic of Korea': 'South Korea',
        'United States of America': 'United States',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'China, Hong Kong Special Administrative Region': 'Hong Kong',
    })
    # Return the whole table (not a single row) so it can be merged on 'Country'.
    return table.reset_index(drop=True)


def GDP():
    """Load ``world_bank.csv`` and return 'Country' plus the 2006-2015 columns.

    The real column names are taken from the row at position 3 of the parsed
    file; year labels there are converted through ``int`` to plain strings.
    """
    raw = pd.read_csv('world_bank.csv')
    header = raw.iloc[3].values
    labels = (header[:4].astype(str).tolist()
              + header[4:].astype(int).astype(str).tolist())
    gdp = raw.iloc[4:].copy()
    gdp.columns = labels

    years = [str(year) for year in range(2006, 2016)]
    gdp = gdp[['Country Name'] + years]
    gdp.columns = ['Country'] + years
    return gdp.replace({
        'Korea, Rep.': 'South Korea',
        'Iran, Islamic Rep.': 'Iran',
        'Hong Kong SAR, China': 'Hong Kong',
    })


def ScimEn():
    """Return sheet 'Sheet1' of ``scimagojr-3.xlsx`` as a DataFrame."""
    with pd.ExcelFile('scimagojr-3.xlsx') as workbook:
        return workbook.parse('Sheet1')


def result():
    """Merge the three tables on 'Country' and return the 15 lowest-'Rank' rows.

    Returns:
        DataFrame indexed by 'Country', sorted by ascending 'Rank'.
    """
    energy_table = energy()
    gdp_table = GDP()
    ranking = ScimEn()
    merged = pd.merge(energy_table, gdp_table, on='Country')
    merged = pd.merge(merged, ranking, on='Country')
    top = merged.sort_values(by=['Rank']).head(15)
    return top.set_index('Country')
# `skipfooter` is the number of trailing rows to ignore (the old `skip_footer`
# spelling is no longer accepted); add further parse options as needed.
en = ex.parse('sheetname', skiprows=2, skipfooter=1)
# Turn the placeholder marker into a real missing value. `np.nan` is the
# spelling that works on every NumPy version (`np.NaN` was removed in 2.0).
en.replace('$%^', np.nan, inplace=True)
# Drop only those listed columns that are actually present in `en`;
# the trailing `...` stands in for any further column names.
en.drop([col for col in ['colname1', 'colname2', ...] if col in en], axis=1, inplace=True)
def answer_one():
    """Build the merged energy / GDP / journal-ranking table.

    Reads 'Energy Indicators.xls', 'world_bank.csv' and 'scimagojr-3.xlsx'
    from the working directory, keeps ranking rows with 'Rank' < 16 and
    inner-joins the three tables on 'Country'.

    Returns:
        DataFrame indexed by 'Country' holding the 2006-2015 GDP columns,
        the ranking columns and the energy columns, in that order.
    """
    import pandas as pd

    # --- Energy indicators ---
    energy = pd.read_excel('Energy Indicators.xls', skiprows=2)
    energy.columns = ['a', 'b', 'Country', 'Energy Supply',
                      'Energy Supply per Capita', '% Renewable']
    energy = energy.drop(columns=['a', 'b'])
    # Coerce first: multiplying an object column would repeat string cells
    # (e.g. '...') a million times instead of scaling numbers.
    energy['Energy Supply'] = pd.to_numeric(energy['Energy Supply'], errors='coerce') * 1000000
    # regex=True is required on pandas >= 2.0; strip() removes the space left
    # behind by "(...)" so names such as "Iran" match the other tables.
    country = energy['Country'].str.replace(r"\(.*\)", "", regex=True)
    energy['Country'] = country.str.replace(r"[0-9()]+$", "", regex=True).str.strip()
    energy.replace(
        {
            'Republic of Korea': 'South Korea',
            'United States of America': 'United States',
            'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
            'China, Hong Kong Special Administrative Region': 'Hong Kong',
        },
        inplace=True,
    )

    # --- World Bank GDP ---
    GDP = pd.read_csv('world_bank.csv', skiprows=4)
    GDP.columns = (['Country', 'Country Code', 'Indicator Name', 'Indicator Code']
                   + [str(year) for year in range(1960, 2016)])
    # replace() returns a new frame; the result must be kept for the renames
    # to take effect.
    GDP = GDP.replace({
        'Korea, Rep.': 'South Korea',
        'Iran, Islamic Rep.': 'Iran',
        'Hong Kong SAR, China': 'Hong Kong',
    })
    GDP = GDP.drop(columns=[str(year) for year in range(1960, 2006)])

    # --- Journal ranking ---
    ScimEn = pd.read_excel('scimagojr-3.xlsx')
    ScimEn = ScimEn[ScimEn['Rank'] < 16]

    # --- Merge ---
    x = pd.merge(GDP, ScimEn, how='inner', on='Country')
    y = pd.merge(x, energy, how='inner', on='Country')
    y = y.set_index('Country')
    return y.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])


answer_one()