关于文件系统:如何在Python中递归地生成目录大小,比如du .

How to generate directory size recursively in python, like du . does?

假设我的结构是这样的

1
2
3
4
5
/ <-- am here
/one/some/dir
/two
/three/has/many/leaves
/hello/world

假设 /one/some/dir 包含一个 500MB 的大文件,而 /three/has/many/leaves 路径上的每一级文件夹中都各包含一个 400MB 的文件。

我想为每个目录生成大小,以便输出

1
2
3
4
5
6
/ - in total for all
/one/some/dir 500mb
/two 0
/three/has/many/leaves - 400mb
/three/has/many 800
/three/has/ 800+someotherbigfilehere

我该怎么办?


看看os.walk。具体来说,文档中有一个查找目录大小的示例:

1
2
3
4
5
6
7
8
import os
from os.path import join, getsize
# For every directory in the tree, report the combined size of the
# non-directory files it directly contains (subdirectories are not included
# in that figure).
for root, dirs, files in os.walk('python/Lib/email'):
    # Bytes used by the files that live directly in `root`.
    file_bytes = sum(getsize(join(root, name)) for name in files)
    # One formatted print() call emits the same line on Python 2 and Python 3;
    # the original print statements are a SyntaxError on Python 3.
    print("%s consumes %d bytes in %d non-directory files"
          % (root, file_bytes, len(files)))
    if 'CVS' in dirs:
        # Pruning `dirs` in place stops os.walk from descending into it.
        dirs.remove('CVS')  # don't visit CVS directories

为了您的目的,这应该足够容易修改。

以下是一个未经测试的版本,以回应您的评论:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import os
from os.path import join, getsize
# Maps each visited directory path to its cumulative size in bytes.
dirs_dict = {}

# We need to walk the tree from the bottom up so that a directory can have easy
# access to the size of its subdirectories.
for root, dirs, files in os.walk('python/Lib/email', topdown=False):

    # Loop through every non directory file in this directory and sum their sizes
    size = sum(getsize(join(root, name)) for name in files)

    # Look at all of the subdirectories and add up their sizes from the
    # `dirs_dict`.  os.walk lists some entries in `dirs` without ever visiting
    # them (symlinks to directories, directories that cannot be read), so they
    # have no recorded size; count those as 0 instead of raising KeyError.
    subdir_size = sum(dirs_dict.get(join(root, d), 0) for d in dirs)

    # store the size of this directory (plus subdirectories) in a dict so we
    # can access it later
    my_size = dirs_dict[root] = size + subdir_size

    # A parenthesised single argument prints identically on Python 2 and 3.
    print('%s: %d' % (root, my_size))


以下脚本打印指定目录的所有子目录的目录大小。这个脚本应该独立于平台posix/windows/etc。它还试图从缓存递归函数调用中获益(如果可能的话)。如果省略参数,脚本将在当前目录中工作。输出按目录大小从大到小排序。所以你可以根据自己的需要来调整它。

我用578019号配方来显示人性化的目录大小。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from __future__ import print_function
import os
import sys
import operator

def null_decorator(ob):
    """Identity decorator: return the decorated object exactly as received."""
    unchanged = ob
    return unchanged

# Pick the decorator applied to the recursive size function: interpreters
# older than 3.2 have no functools.lru_cache, so they get the no-op decorator.
if sys.version_info < (3, 2, 0):
    my_cache_decorator = null_decorator
else:
    import functools
    my_cache_decorator = functools.lru_cache(maxsize=4096)

# Directory to analyse: the first command-line argument (made absolute and
# normalised) when one is given, otherwise the current directory.
if len(sys.argv) > 1:
    start_dir = os.path.normpath(os.path.abspath(sys.argv[1]))
else:
    start_dir = '.'

@my_cache_decorator
def get_dir_size(start_path = '.'):
    """Return the number of bytes held by regular files under *start_path*.

    Subdirectories are measured by calling this function recursively.
    Symbolic links are neither followed nor counted.  ``os.scandir`` is used
    when the running interpreter provides it; otherwise the function falls
    back to ``os.listdir`` plus ``os.path`` checks.
    """
    byte_count = 0

    if hasattr(os, 'scandir'):
        # Fast path: os.scandir (new in version 3.5).
        for item in os.scandir(start_path):
            if item.is_dir(follow_symlinks=False):
                byte_count += get_dir_size(item.path)
                continue
            if item.is_file(follow_symlinks=False):
                byte_count += item.stat().st_size
        return byte_count

    # Compatible but slower path: os.listdir.
    for name in os.listdir(start_path):
        candidate = os.path.abspath(os.path.join(start_path, name))
        if os.path.islink(candidate):
            continue
        if os.path.isdir(candidate):
            byte_count += get_dir_size(candidate)
        elif os.path.isfile(candidate):
            byte_count += os.path.getsize(candidate)
    return byte_count

def get_dir_size_walk(start_path = '.'):
    """Return the total size in bytes of the regular files below *start_path*.

    The tree is traversed with ``os.walk``.  Symbolic links are skipped, the
    same way ``get_dir_size`` skips them.

    :param start_path: directory whose contents are measured.
    :return: sum of the sizes of all non-link files in the tree, in bytes.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            # os.path.getsize follows links: a dangling link would raise
            # OSError and a valid one would be counted with its target's size.
            if os.path.islink(fp):
                continue
            total_size += os.path.getsize(fp)
    return total_size

def bytes2human(n, format='%(value).0f%(symbol)s', symbols='customary'):
    """
    (c) http://code.activestate.com/recipes/578019/

    Convert n bytes into a human readable string based on format.

    :param n: number of bytes; it is truncated with ``int(n)`` first.
    :param format: %-style template that may use the keys ``value`` (the
        scaled number) and ``symbol`` (the unit name).
    :param symbols: name of the unit set to use: "customary",
        "customary_ext", "iec" or "iec_ext",
        see: https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000
    :return: the formatted string.
    :raises ValueError: if n is negative.
    :raises KeyError: if symbols is not one of the known unit sets.

      >>> bytes2human(0)
      '0B'
      >>> bytes2human(0.9)
      '0B'
      >>> bytes2human(1)
      '1B'
      >>> bytes2human(1.9)
      '1B'
      >>> bytes2human(1024)
      '1K'
      >>> bytes2human(1048576)
      '1M'
      >>> bytes2human(1099511627776127398123789121)
      '909Y'

      >>> bytes2human(9856, symbols="customary")
      '10K'
      >>> bytes2human(9856, symbols="customary_ext")
      '10kilo'
      >>> bytes2human(9856, symbols="iec")
      '10Ki'
      >>> bytes2human(9856, symbols="iec_ext")
      '10kibi'

      >>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
      '9.8 K/sec'

      >>> # precision can be adjusted by playing with %f operator
      >>> bytes2human(10000, format="%(value).5f %(symbol)s")
      '9.76562 K'
    """
    # Unit names, smallest first; the entry at index k stands for 2 ** (10 * k)
    # bytes.  The strings are kept exactly as in the recipe because they are
    # part of the function's output.
    SYMBOLS = {
        'customary'     : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
        'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                           'zetta', 'iotta'),
        'iec'           : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
        'iec_ext'       : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                           'zebi', 'yobi'),
    }
    n = int(n)
    if n < 0:
        raise ValueError("n < 0")
    units = SYMBOLS[symbols]

    # Try the largest unit first and stop at the first one that fits.
    for exponent in range(len(units) - 1, 0, -1):
        threshold = 1 << (exponent * 10)
        if n >= threshold:
            return format % dict(symbol=units[exponent],
                                 value=float(n) / threshold)

    # Below 1024: report in the base unit, unscaled.
    return format % dict(symbol=units[0], value=n)

############################################################
###
###  main ()
###
############################################################
if __name__ == '__main__':
    # Size function used for the report.  get_dir_size is the recursive
    # variant that can benefit from cached calls (functools.lru_cache);
    # get_dir_size_walk is the os.walk based alternative.
    get_size = get_dir_size

    # Map every subdirectory found below start_dir to its total size.
    dir_tree = {}
    for parent, subdirs, _files in os.walk(start_dir):
        for name in subdirs:
            candidate = os.path.join(parent, name)
            if os.path.isdir(candidate):
                dir_tree[candidate] = get_size(candidate)

    # Print the report, largest directory first.
    ranked = sorted(dir_tree.items(), key=operator.itemgetter(1), reverse=True)
    for path, nbytes in ranked:
        readable = bytes2human(nbytes, format='%(value).2f%(symbol)s')
        print('%s\t%s' % (readable, path))

    print('-' * 80)
    # cache_info() is only available when lru_cache was applied (Python 3.2+).
    if sys.version_info >= (3,2,0):
        print(get_dir_size.cache_info())

示例输出:

1
2
3
4
5
6
37.61M  .\subdir_b
2.18M   .\subdir_a
2.17M   .\subdir_a\subdir_a_2
4.41K   .\subdir_a\subdir_a_1
----------------------------------------------------------
CacheInfo(hits=2, misses=4, maxsize=4096, currsize=4)


我通过以下代码实现了这一点:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def get_dir_size(path=None):
    """Print the size of each directory under *path* and return the total.

    One line is printed per directory with the combined size of the files it
    directly contains, followed by a final line with the grand total.

    :param path: root directory to measure; defaults to the current working
        directory at the time of the call.
    :return: total size in bytes of all files in the tree.
    """
    if path is None:
        # Resolved here rather than in the signature: a default of
        # os.getcwd() would be evaluated only once, when the function is
        # defined, and would ignore any later change of directory.
        path = os.getcwd()

    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        # Bytes used by the files directly inside this directory.
        dirsize = sum(os.path.getsize(os.path.join(dirpath, f))
                      for f in filenames)
        total_size += dirsize
        print('\t',dirsize, dirpath)
    print(" {0:.2f} Kb".format(total_size/1024))
    return total_size

实际上,如果目录中存在符号链接,@mgilson 的答案将无法正常工作。要支持符号链接,必须这样做:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Maps every visited path (files and directories) to its size in bytes;
# symbolic links are recorded with size 0.
dirs_dict = {}
for root, dirs, files in os.walk(directory, topdown=False):
    if os.path.islink(root):
        # Plain 0 instead of the Python 2 only literal 0L, which is a
        # SyntaxError on Python 3.
        dirs_dict[root] = 0
    else:
        # Start with the size reported for the directory entry itself.
        dir_size = getsize(root)

        # Loop through every non directory file in this directory and sum their sizes
        for name in files:
            full_name = join(root, name)
            if os.path.islink(full_name):
                nsize = 0
            else:
                nsize = getsize(full_name)
            dirs_dict[full_name] = nsize
            dir_size += nsize

        # Look at all of the subdirectories and add up their sizes from the `dirs_dict`
        subdir_size = 0
        for d in dirs:
            full_d = join(root, d)
            if os.path.islink(full_d):
                dirs_dict[full_d] = 0
            else:
                # A subdirectory os.walk could not read is never visited and
                # has no recorded size; count it as 0 instead of raising
                # KeyError.
                subdir_size += dirs_dict.get(full_d, 0)

        dirs_dict[root] = dir_size + subdir_size