Class variable dictionary not saving with pickle.dump in Python 2.7
I am using pickle to save an object graph by dumping the root. When I load the root, it has all of its instance variables and connected object nodes. However, I am saving all of the nodes in a class variable of type dictionary. The class variable is full before saving, but after unpickling the data it is empty.
Here is the class I am using:
```python
from urllib import urlopen
import re
import pickle

class Page():
    __crawled = {}  # class variable, shared by all Page instances

    def __init__(self, title = '', link = '', relatedURLs = []):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        return self.__related

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self, url):
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile(' <li> <a href="([^"]*)"')

            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)

            newPage = Page(findPatTitle, findPatLink, findPatRelated)
            self.__related.append(newPage)
            self.__crawled[url] = newPage
        else:
            self.__related.append(self.__crawled[url])

    def crawlRelated(self):
        for link in self.__relatedURLs:
            self.crawl(link)
```
I save it like this:
```python
with open('medTwiceGraph.dat', 'w') as outf:
    pickle.dump(root, outf)
```
And I load it like this:
```python
def loadGraph(filename):  # returns root
    with open(filename, 'r') as inf:
        return pickle.load(inf)

root = loadGraph('medTwiceGraph.dat')
```
All of the data loads except for the `__crawled` class variable.
What am I doing wrong?
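A stripped-down sketch of the same behavior (simplified from the class above; names are illustrative, not my full crawler):

```python
import pickle

class Page(object):
    crawled = {}  # class variable, shared by all instances

    def __init__(self, link):
        self.link = link

root = Page('http://example.com')
Page.crawled[root.link] = root

data = pickle.dumps(root)
Page.crawled.clear()       # simulate loading in a fresh interpreter session
restored = pickle.loads(data)

print restored.link        # instance attribute comes back: 'http://example.com'
print restored.crawled     # class variable is empty: {}
```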
Python does not really pickle class objects. It simply saves their names and where to find them. From the documentation of pickle:
Similarly, classes are pickled by named reference, so the same restrictions in the unpickling environment apply. Note that none of the class's code or data is pickled, so in the following example the class attribute attr is not restored in the unpickling environment:

```python
class Foo:
    attr = 'a class attr'

picklestring = pickle.dumps(Foo)
```

These restrictions are why picklable functions and classes must be defined in the top level of a module.

Similarly, when class instances are pickled, their class's code and data are not pickled along with them. Only the instance data are pickled. This is done on purpose, so you can fix bugs in a class or add methods to the class and still load objects that were created with an earlier version of the class. If you plan to have long-lived objects that will see many versions of a class, it may be worthwhile to put a version number in the objects so that suitable conversions can be made by the class's `__setstate__()` method.
In your example, you could fix the problem by changing `__crawled` to be an instance attribute or a global variable.
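For example, a minimal sketch of the instance-attribute version (note that the dictionary then stops being shared between `Page` instances, which is why the asker's eventual fix further down moves it into a separate `Graph` object instead):

```python
class Page(object):
    def __init__(self, title='', link='', relatedURLs=None):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs or []
        self.__related = []
        self.__crawled = {}  # lives in self.__dict__ now, so pickle saves it
```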
By default, pickle will only use the contents of `self.__dict__` and not `self.__class__.__dict__`, which is what you think you want. I say "what you think you want" because unpickling an instance should not mutate class-level state.
If you want to change that behavior, then look at `__getstate__` and `__setstate__` in the documentation.
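If you really do want the shared dictionary to travel inside the pickle, a minimal sketch of the `__getstate__`/`__setstate__` route might look like this (untested against the full crawler, with a hypothetical `_crawled_snapshot` key; note that it deliberately does what the previous paragraph warns about, i.e. unpickling mutates class-level state):

```python
class Page(object):
    __crawled = {}

    def __getstate__(self):
        # bundle a snapshot of the class-level dict into the instance state
        state = self.__dict__.copy()
        state['_crawled_snapshot'] = Page.__crawled
        return state

    def __setstate__(self, state):
        # restore the class-level dict first, then the instance attributes
        Page.__crawled = state.pop('_crawled_snapshot')
        self.__dict__.update(state)
```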
For anyone interested, what I did was create a superclass `Graph`, which contains the instance variables, and moved my crawling functions into it. `Page` now only contains attributes describing the page and its related pages. I pickle my `Graph` instance, which contains all of my `Page` instances. Here is my code.
```python
from urllib import urlopen
#from bs4 import BeautifulSoup
import re
import pickle

################### CLASS GRAPH ####################
class Graph(object):
    def __init__(self, roots = [], crawled = {}):
        self.__roots = roots
        self.__crawled = crawled

    @property
    def roots(self):
        return self.__roots

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self, page, url):
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile(' <li> <a href="([^"]*)"')

            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)

            newPage = Page(findPatTitle, findPatLink, findPatRelated)
            page.related.append(newPage)
            self.__crawled[url] = newPage
        else:
            page.related.append(self.__crawled[url])

    def crawlRelated(self, page):
        for link in page.relatedURLs:
            self.crawl(page, link)

    def crawlAll(self, obj, limit = 2, i = 0):
        print 'number of crawled pages:', len(self.crawled)
        i += 1
        if i > limit:
            return
        else:
            for rel in obj.related:
                print 'crawling', rel.title
                self.crawlRelated(rel)
            for rel2 in obj.related:
                self.crawlAll(rel2, limit, i)

    def loadGraph(self, filename):
        with open(filename, 'r') as inf:
            return pickle.load(inf)

    def saveGraph(self, obj, filename):
        with open(filename, 'w') as outf:
            pickle.dump(obj, outf)

#################### CLASS PAGE ####################
class Page(Graph):
    def __init__(self, title = '', link = '', relatedURLs = []):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        return self.__related

####################### MAIN #######################
def main(seed):
    print 'doing some work...'
    webpage = urlopen(seed).read()
    patFinderTitle = re.compile('<title>(.*)</title>')
    patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
    patFinderRelated = re.compile(' <li> <a href="([^"]*)"')

    findPatTitle = re.findall(patFinderTitle, webpage)
    findPatLink = re.findall(patFinderLink, webpage)
    findPatRelated = re.findall(patFinderRelated, webpage)
    print 'found the webpage', findPatTitle

    #root = Page(findPatTitle,findPatLink,findPatRelated)
    G = Graph([Page(findPatTitle, findPatLink, findPatRelated)])
    print 'crawling related...'
    G.crawlRelated(G.roots[0])
    G.crawlAll(G.roots[0])
    print 'now saving...'
    G.saveGraph(G, 'medTwiceGraph.dat')
    print 'done'
    return G
##################### END MAIN #####################

#'http://medtwice.com/am-i-pregnant/'
#'medTwiceGraph.dat'

#G = main('http://medtwice.com/menopause-overview/')
#print G.crawled

def loadGraph(filename):
    with open(filename, 'r') as inf:
        return pickle.load(inf)

G = loadGraph('medTwiceGraph.dat')
print G.roots[0].title
print G.roots[0].related
print G.crawled

for key in G.crawled:
    print G.crawled[key].title
```
Using `dill` instead of `pickle` solves this problem: unlike `pickle`, `dill` can serialize class definitions themselves (including class variables such as `__crawled`) rather than storing only a named reference.
Based on the asker's code:
```python
import dill

# note: the file must be opened in binary mode

# save
with open('medTwiceGraph.dat', 'wb') as outf:
    dill.dump(root, outf)

# load
def loadGraph(filename):  # returns root
    with open(filename, 'rb') as inf:
        return dill.load(inf)

root = loadGraph('medTwiceGraph.dat')
```
I wrote another example:
```python
# Another example (with Python 3.x)
import dill
import os

class Employee:
    def __init__(self, name='', contact={}):
        self.name = name
        self.contact = contact

    def print_self(self):
        print(self.name, self.contact)

# save
def save_employees():
    global emp
    with open('employees.dat', 'wb') as fh:
        dill.dump(emp, fh)

# load
def load_employees():
    global emp
    if os.path.exists('employees.dat'):
        with open('employees.dat', 'rb') as fh:
            emp = dill.load(fh)

#---
emp = []
load_employees()
print('loaded:')
for tmpe in emp:
    tmpe.print_self()

e = Employee()  # new employee
if len(emp) == 0:
    e.name = 'Jack'
    e.contact = {'phone': '+086-12345678'}
elif len(emp) == 1:
    e.name = 'Jane'
    e.contact = {'phone': '+01-15555555', 'email': '[email protected]'}
else:
    e.name = 'sb.'
    e.contact = {'telegram': 'x'}

emp.append(e)
save_employees()
```