How to generate directory size recursively in Python, like `du .` does?
Say my structure is like this:

```
/  -- am here
/one/some/dir
/two
/three/has/many/leaves
/hello/world
```

Say /one/some/dir contains one large 500MB file, and /three/has/many/leaves contains a 400MB file in each folder.

I want to generate the size for each directory, so the output looks like:

```
/                        - in total for all
/one/some/dir            500mb
/two                     0
/three/has/many/leaves   - 400mb
/three/has/many          800
/three/has/              800+someotherbigfilehere
```

How would I go about this?
Take a look at `os.walk`:

```python
import os
from os.path import join, getsize

for root, dirs, files in os.walk('python/Lib/email'):
    print root, "consumes",
    print sum(getsize(join(root, name)) for name in files),
    print "bytes in", len(files), "non-directory files"
    if 'CVS' in dirs:
        dirs.remove('CVS')  # don't visit CVS directories
```

It should be easy enough to modify this for your purposes.
Here is an untested version in response to your comment:

```python
import os
from os.path import join, getsize

dirs_dict = {}

# We need to walk the tree from the bottom up so that a directory can have easy
# access to the size of its subdirectories.
for root, dirs, files in os.walk('python/Lib/email', topdown=False):

    # Loop through every non-directory file in this directory and sum their sizes
    size = sum(getsize(join(root, name)) for name in files)

    # Look at all of the subdirectories and add up their sizes from `dirs_dict`
    subdir_size = sum(dirs_dict[join(root, d)] for d in dirs)

    # Store the size of this directory (plus subdirectories) in a dict so we
    # can access it later
    my_size = dirs_dict[root] = size + subdir_size

    print '%s: %d' % (root, my_size)
```
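For reference, here is a minimal, self-contained sketch of the same bottom-up idea wrapped in a reusable function. The name `dir_sizes` and the command-line printing are my own additions, not part of the answer above:

```python
import os
from os.path import join, getsize

def dir_sizes(top):
    """Return a dict mapping each directory under `top` to its total size in bytes."""
    sizes = {}
    # Walk bottom-up so every subdirectory's total is already known
    # by the time its parent is visited.
    for root, dirs, files in os.walk(top, topdown=False):
        file_total = sum(getsize(join(root, name)) for name in files)
        # Symlinked subdirectories are not descended into by os.walk,
        # so they may be missing from `sizes`; count them as 0.
        subdir_total = sum(sizes.get(join(root, d), 0) for d in dirs)
        sizes[root] = file_total + subdir_total
    return sizes

if __name__ == '__main__':
    for path, size in sorted(dir_sizes('.').items()):
        print('%s\t%d' % (path, size))
```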
The following script prints the directory size of all subdirectories of the specified directory. It should be platform independent (POSIX/Windows/etc.). It also tries to benefit from caching the recursive function calls, where possible. If the argument is omitted, the script works in the current directory. The output is sorted by directory size, from largest to smallest, so you can adjust it to your needs.

I used recipe 578019 to display directory sizes in a human-friendly format.
```python
from __future__ import print_function
import os
import sys
import operator

def null_decorator(ob):
    return ob

if sys.version_info >= (3, 2, 0):
    import functools
    my_cache_decorator = functools.lru_cache(maxsize=4096)
else:
    my_cache_decorator = null_decorator

start_dir = os.path.normpath(os.path.abspath(sys.argv[1])) if len(sys.argv) > 1 else '.'

@my_cache_decorator
def get_dir_size(start_path='.'):
    total_size = 0
    if 'scandir' in dir(os):
        # using fast 'os.scandir' method (new in version 3.5)
        for entry in os.scandir(start_path):
            if entry.is_dir(follow_symlinks=False):
                total_size += get_dir_size(entry.path)
            elif entry.is_file(follow_symlinks=False):
                total_size += entry.stat().st_size
    else:
        # using slow, but compatible 'os.listdir' method
        for entry in os.listdir(start_path):
            full_path = os.path.abspath(os.path.join(start_path, entry))
            if os.path.islink(full_path):
                continue
            if os.path.isdir(full_path):
                total_size += get_dir_size(full_path)
            elif os.path.isfile(full_path):
                total_size += os.path.getsize(full_path)
    return total_size

def get_dir_size_walk(start_path='.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

def bytes2human(n, format='%(value).0f%(symbol)s', symbols='customary'):
    """
    (c) http://code.activestate.com/recipes/578019/

    Convert n bytes into a human readable string based on format.
    symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
    see: https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000

    >>> bytes2human(0)
    '0.0 B'
    >>> bytes2human(0.9)
    '0.0 B'
    >>> bytes2human(1)
    '1.0 B'
    >>> bytes2human(1.9)
    '1.0 B'
    >>> bytes2human(1024)
    '1.0 K'
    >>> bytes2human(1048576)
    '1.0 M'
    >>> bytes2human(1099511627776127398123789121)
    '909.5 Y'

    >>> bytes2human(9856, symbols="customary")
    '9.6 K'
    >>> bytes2human(9856, symbols="customary_ext")
    '9.6 kilo'
    >>> bytes2human(9856, symbols="iec")
    '9.6 Ki'
    >>> bytes2human(9856, symbols="iec_ext")
    '9.6 kibi'

    >>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
    '9.8 K/sec'

    >>> # precision can be adjusted by playing with %f operator
    >>> bytes2human(10000, format="%(value).5f %(symbol)s")
    '9.76562 K'
    """
    SYMBOLS = {
        'customary'     : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
        'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                           'zetta', 'iotta'),
        'iec'           : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
        'iec_ext'       : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                           'zebi', 'yobi'),
    }
    n = int(n)
    if n < 0:
        raise ValueError("n < 0")
    symbols = SYMBOLS[symbols]
    prefix = {}
    for i, s in enumerate(symbols[1:]):
        prefix[s] = 1 << (i + 1) * 10
    for symbol in reversed(symbols[1:]):
        if n >= prefix[symbol]:
            value = float(n) / prefix[symbol]
            return format % locals()
    return format % dict(symbol=symbols[0], value=n)

############################################################
###
###  main()
###
############################################################
if __name__ == '__main__':
    dir_tree = {}
    ### version that uses the 'slow' os.walk method
    # get_size = get_dir_size_walk
    ### this recursive version can benefit from caching the function calls (functools.lru_cache)
    get_size = get_dir_size

    for root, dirs, files in os.walk(start_dir):
        for d in dirs:
            dir_path = os.path.join(root, d)
            if os.path.isdir(dir_path):
                dir_tree[dir_path] = get_size(dir_path)

    for d, size in sorted(dir_tree.items(), key=operator.itemgetter(1), reverse=True):
        print('%s\t%s' % (bytes2human(size, format='%(value).2f%(symbol)s'), d))

    print('-' * 80)
    if sys.version_info >= (3, 2, 0):
        print(get_dir_size.cache_info())
```
Sample output:

```
37.61M  .\subdir_b
2.18M   .\subdir_a
2.17M   .\subdir_a\subdir_a_2
4.41K   .\subdir_a\subdir_a_1
----------------------------------------------------------
CacheInfo(hits=2, misses=4, maxsize=4096, currsize=4)
```
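If you only need the building blocks rather than the full report, the two helpers above can also be called directly. A minimal usage sketch, assuming the functions defined in the script are in scope and `some_dir` is a placeholder path:

```python
size_in_bytes = get_dir_size('some_dir')  # cached; uses os.scandir when available
print(bytes2human(size_in_bytes, format='%(value).2f%(symbol)s'))

# bytes2human also works on its own:
print(bytes2human(1048576))  # prints '1M' with the default format
```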
I achieved this with the following code:

```python
import os

def get_dir_size(path=os.getcwd()):
    # note: the default path is captured when the function is defined
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        dirsize = 0
        for f in filenames:
            fp = os.path.join(dirpath, f)
            size = os.path.getsize(fp)
            #print('\t', size, f)
            #print(dirpath, dirnames, filenames, size)
            dirsize += size
            total_size += size
        print('\t', dirsize, dirpath)
    print(" {0:.2f} Kb".format(total_size / 1024))
```
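Note that, unlike `du`, the `dirsize` printed for each directory counts only the files directly inside it (subdirectory contents are not rolled up); only `total_size` is cumulative. A minimal usage sketch, with a placeholder path of my own:

```python
import os

# Print the per-directory file sizes and the grand total for a tree.
get_dir_size(os.path.expanduser('~/some/dir'))
```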
Actually, @mgilson's answer won't work if there are symbolic links in the directory. To allow for that, you have to do it like this:

```python
import os
from os.path import join, getsize

dirs_dict = {}

# `directory` is the path of the tree to measure
for root, dirs, files in os.walk(directory, topdown=False):
    if os.path.islink(root):
        dirs_dict[root] = 0L
    else:
        dir_size = getsize(root)

        # Loop through every non-directory file in this directory and sum their sizes
        for name in files:
            full_name = join(root, name)
            if os.path.islink(full_name):
                nsize = 0L
            else:
                nsize = getsize(full_name)
            dirs_dict[full_name] = nsize
            dir_size += nsize

        # Look at all of the subdirectories and add up their sizes from `dirs_dict`
        subdir_size = 0L
        for d in dirs:
            full_d = join(root, d)
            if os.path.islink(full_d):
                dirs_dict[full_d] = 0L
            else:
                subdir_size += dirs_dict[full_d]

        dirs_dict[root] = dir_size + subdir_size
```
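To then get `du`-style output, the populated `dirs_dict` can be sorted and printed. A minimal sketch of my own (it assumes the loop above has already run; `dirs_dict` also contains file entries, so only directories are reported):

```python
import os

for path, size in sorted(dirs_dict.items(), key=lambda item: item[1], reverse=True):
    if os.path.isdir(path) and not os.path.islink(path):  # skip plain files and symlinks
        print('%d\t%s' % (size, path))
```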