Class variable dictionary not saving with pickle.dump in python 2.7

I am using pickle to save an object graph by dumping the root. When I load the root, it has all of its instance variables and connected object nodes. However, I am saving all the nodes in a class variable of type dictionary. The class variable is full before saving, but after unpickling the data it is empty.

Here is the class I am using:

from urllib import urlopen  # assumed imports (Python 2.x); not shown in the original question
import re
import pickle

class Page():

    __crawled = {}

    def __init__(self, title = '', link = '', relatedURLs = []):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        return self.__related

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self,url):
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')

            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle,findPatLink,findPatRelated)
            self.__related.append(newPage)
            self.__crawled[url] = newPage
        else:
            self.__related.append(self.__crawled[url])

    def crawlRelated(self):
        for link in self.__relatedURLs:
            self.crawl(link)

I am saving it like this:

with open('medTwiceGraph.dat','w') as outf:
    pickle.dump(root,outf)

I am loading it like this:

def loadGraph(filename): #returns root
    with open(filename,'r') as inf:
        return pickle.load(inf)

root = loadGraph('medTwiceGraph.dat')

All of the data loads except for the class variable __crawled.

What am I doing wrong?


Python does not really pickle class objects. It simply saves their names and where to find them. From the pickle documentation:

Similarly, classes are pickled by named reference, so the same
restrictions in the unpickling environment apply. Note that none of
the class’s code or data is pickled, so in the following example the
class attribute attr is not restored in the unpickling environment:

class Foo:
    attr = 'a class attr'

picklestring = pickle.dumps(Foo)

These restrictions are why picklable functions and classes must be
defined in the top level of a module.

Similarly, when class instances are pickled, their class’s code and
data are not pickled along with them. Only the instance data are
pickled. This is done on purpose, so you can fix bugs in a class or
add methods to the class and still load objects that were created with
an earlier version of the class. If you plan to have long-lived
objects that will see many versions of a class, it may be worthwhile
to put a version number in the objects so that suitable conversions
can be made by the class’s __setstate__() method.

In your example, you could fix the problem by changing __crawled to an instance attribute or a global variable.
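
For example, here is a minimal Python 2 sketch (hypothetical, trimmed down from the asker's class) of the instance-attribute fix; the dictionary now lives in self.__dict__, so pickle saves and restores it:

import pickle

class Page(object):
    def __init__(self):
        # Instance attribute: stored in self.__dict__, so pickle includes it.
        self.__crawled = {}

    @property
    def crawled(self):
        return self.__crawled

p = Page()
p.crawled['http://example.com/'] = 'node'
restored = pickle.loads(pickle.dumps(p))
print restored.crawled  # {'http://example.com/': 'node'}

The trade-off is that each Page instance now carries its own dictionary, so instances no longer share one crawled cache.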


By default, pickle will only use the contents of self.__dict__, and not self.__class__.__dict__, which is what you might think you want.

I say "what you might think you want" because unpickling an instance should not mutate class-level state.

If you want to change that behaviour, then look at __getstate__ and __setstate__ in the docs.
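
As a hedged sketch (not from the original answer): __getstate__ and __setstate__ can bundle the class-level dictionary into each instance's pickled state, at the cost of mutating class state when unpickling:

import pickle

class Page(object):
    _crawled = {}  # class attribute; normally not restored by unpickling

    def __getstate__(self):
        # Ship the class-level dict along with the instance state.
        state = self.__dict__.copy()
        state['_class_crawled'] = Page._crawled
        return state

    def __setstate__(self, state):
        # Restore the class-level dict; this deliberately mutates class state.
        Page._crawled.update(state.pop('_class_crawled'))
        self.__dict__.update(state)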


For anyone interested, what I did was create a superclass Graph that holds the dictionary as an instance variable, and moved my crawling functions into Graph. Page now only contains attributes describing the page and its related pages. I pickle my Graph instance, which contains all of my Page instances. Here is my code.

from urllib import urlopen
#from bs4 import BeautifulSoup
import re
import pickle

###################CLASS GRAPH####################
class Graph(object):
    def __init__(self,roots = [],crawled = {}):
        self.__roots = roots
        self.__crawled = crawled
    @property
    def roots(self):
        return self.__roots
    @property
    def crawled(self):
        return self.__crawled
    def crawl(self,page,url):
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')

            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle,findPatLink,findPatRelated)
            page.related.append(newPage)
            self.__crawled[url] = newPage
        else:
            page.related.append(self.__crawled[url])

    def crawlRelated(self,page):
        for link in page.relatedURLs:
            self.crawl(page,link)
    def crawlAll(self,obj,limit = 2,i = 0):
        print 'number of crawled pages:', len(self.crawled)
        i += 1
        if i > limit:
            return
        else:
            for rel in obj.related:
                print 'crawling', rel.title
                self.crawlRelated(rel)
            for rel2 in obj.related:
                self.crawlAll(rel2,limit,i)          
    def loadGraph(self,filename):
        with open(filename,'r') as inf:
            return pickle.load(inf)
    def saveGraph(self,obj,filename):
        with open(filename,'w') as outf:
            pickle.dump(obj,outf)
###################CLASS PAGE#####################
class Page(Graph):
    def __init__(self, title = '', link = '', relatedURLs = []):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs
        self.__related = []      
    @property
    def relatedURLs(self):
        return self.__relatedURLs
    @property
    def title(self):
        return self.__title
    @property
    def related(self):
        return self.__related
####################### MAIN ######################
def main(seed):
    print 'doing some work...'
    webpage = urlopen(seed).read()

    patFinderTitle = re.compile('<title>(.*)</title>')
    patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
    patFinderRelated = re.compile('<li><a href="([^"]*)"')

    findPatTitle = re.findall(patFinderTitle, webpage)
    findPatLink = re.findall(patFinderLink, webpage)
    findPatRelated = re.findall(patFinderRelated, webpage)

    print 'found the webpage', findPatTitle

    #root = Page(findPatTitle,findPatLink,findPatRelated)
    G = Graph([Page(findPatTitle,findPatLink,findPatRelated)])
    print 'crawling related...'
    G.crawlRelated(G.roots[0])
    G.crawlAll(G.roots[0])  
    print 'now saving...'
    G.saveGraph(G, 'medTwiceGraph.dat')
    print 'done'
    return G
#####################END MAIN######################

#'http://medtwice.com/am-i-pregnant/'
#'medTwiceGraph.dat'

#G = main('http://medtwice.com/menopause-overview/')
#print G.crawled


def loadGraph(filename):
    with open(filename,'r') as inf:
        return pickle.load(inf)

G = loadGraph('medTwiceGraph.dat')
print G.roots[0].title
print G.roots[0].related
print G.crawled

for key in G.crawled:
    print G.crawled[key].title

Using dill can solve this problem: unlike pickle, dill can serialize the class definition itself, so class attributes like __crawled survive. The dill package: https://pypi.python.org/pypi/dill Reference: https://stackoverflow.com/a/28543378/6301132

Applied to the asker's code:

# notice: the file must be opened in binary mode
import dill

#save
with open('medTwiceGraph.dat','wb') as outf:
    dill.dump(root,outf)
#load
def loadGraph(filename): #returns root
    with open(filename,'rb') as inf:
        return dill.load(inf)

root = loadGraph('medTwiceGraph.dat')

I wrote another example:

#Another example (with Python 3.x)

import dill
import os

class Employee:

    def __init__(self, name='', contact={}):
        self.name = name
        self.contact = contact

    def print_self(self):
        print(self.name, self.contact)

#save
def save_employees():
    global emp
    with open('employees.dat','wb') as fh:
        dill.dump(emp,fh)

#load
def load_employees():
    global emp
    if os.path.exists('employees.dat'):
        with open('employees.dat','rb') as fh:
            emp=dill.load(fh)

#---
emp=[]
load_employees()
print('loaded:')
for tmpe in emp:
    tmpe.print_self()

e=Employee() #new employee
if len(emp)==0:
    e.name='Jack'
    e.contact={'phone':'+086-12345678'}
elif len(emp)==1:
    e.name='Jane'
    e.contact={'phone':'+01-15555555','email':'[email protected]'}
else:
    e.name='sb.'
    e.contact={'telegram':'x'}
emp.append(e)

save_employees()