Node.js & Amazon S3: How to iterate through all files in a bucket?
是否有任何适用于Node.js的Amazon S3客户端库,允许列出S3存储桶中的所有文件?
最著名的aws2js和knox似乎没有此功能。
使用官方的aws-sdk:
// List every key in the bucket by following Marker-based pagination of
// s3.listObjects; keys accumulate in the module-level `allKeys` array and
// cb(null, allKeys) fires once the last page arrives.
//
// Fixes over the original:
//  - errors from listObjects are no longer silently ignored (cb was never
//    called on failure);
//  - pages are flattened with concat (push() appended each page as a nested
//    array element);
//  - NextMarker is only returned when a Delimiter is requested, so fall back
//    to the last Key of the current page.
var allKeys = [];
function listAllKeys(marker, cb) {
  s3.listObjects({ Bucket: s3bucket, Marker: marker }, function (err, data) {
    if (err) return cb(err);
    allKeys = allKeys.concat(data.Contents);
    if (data.IsTruncated) {
      // NOTE: data.NextMarker is undefined unless a Delimiter was specified.
      var next = data.NextMarker || data.Contents[data.Contents.length - 1].Key;
      listAllKeys(next, cb);
    } else {
      cb(null, allKeys);
    }
  });
}
参见s3.listObjects
编辑2017:
基本思想相同,但现在建议使用
// listObjectsV2 variant: pages with ContinuationToken instead of Marker.
// Keys accumulate in the module-level `allKeys`; cb(null, allKeys) fires
// after the last page.
//
// Fix over the original: errors were silently dropped and cb never fired
// on failure — propagate them as cb(err).
var allKeys = [];
function listAllKeys(token, cb) {
  var opts = { Bucket: s3bucket };
  if (token) opts.ContinuationToken = token;

  s3.listObjectsV2(opts, function (err, data) {
    if (err) return cb(err);
    allKeys = allKeys.concat(data.Contents);
    if (data.IsTruncated) {
      listAllKeys(data.NextContinuationToken, cb);
    } else {
      cb(null, allKeys);
    }
  });
}
这是我编写的从截断列表组装S3对象的Node代码。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | var params = { Bucket: <yourbucket>, Prefix: <yourprefix>, }; var s3DataContents = []; // Single array of all combined S3 data.Contents function s3Print() { if (program.al) { // --al: Print all objects console.log(JSON.stringify(s3DataContents, null," ")); } else { // --b: Print key only, otherwise also print index var i; for (i = 0; i < s3DataContents.length; i++) { var head = !program.b ? (i+1) + ': ' : ''; console.log(head + s3DataContents[i].Key); } } } function s3ListObjects(params, cb) { s3.listObjects(params, function(err, data) { if (err) { console.log("listS3Objects Error:", err); } else { var contents = data.Contents; s3DataContents = s3DataContents.concat(contents); if (data.IsTruncated) { // Set Marker to last returned key params.Marker = contents[contents.length-1].Key; s3ListObjects(params, cb); } else { cb(); } } }); } s3ListObjects(params, s3Print); |
请注意NextMarker的listObject文档,该文档并不总是存在于返回的数据对象中,因此在上面的代码中我根本不使用它。
NextMarker — (String) When response is truncated (the IsTruncated
element value in the response is true), you can use the key name in
this field as marker in the subsequent request to get next set of
objects. Amazon S3 lists objects in alphabetical order Note: This
element is returned only if you have delimiter request parameter
specified. If response does not include the NextMarker and it is
truncated, you can use the value of the last Key in the response as
the marker in the subsequent request to get the next set of object
keys.
整个程序现已推送到https://github.com/kenklin/s3list。
实际上,aws2js支持通过
// List a "folder" with aws2js by issuing a raw GET whose query string
// carries the prefix; the response is parsed as XML.
var s3 = require('aws2js').load('s3', awsAccessKeyId, awsSecretAccessKey);
s3.setBucket(bucketName);

var folder = encodeURI('some/path/to/S3/folder');

s3.get('?prefix=' + folder, 'xml', function (error, data) {
    console.log(error);
    console.log(data);
});
上面片段中的
当我找不到好的现有解决方案时,我发布了 knox-copy。它将 Rest API 的所有分页细节封装成一个熟悉的 Node 流:
// knox-copy hides the Rest API's Marker pagination behind a familiar
// Node stream that emits one key per 'data' event.
var knoxCopy = require('knox-copy');

var client = knoxCopy.createClient({
  key: '',
  secret: '<secret-here>',
  bucket: 'mrbucket'
});

var keyStream = client.streamKeys({
  // omit the prefix to list the whole bucket
  prefix: 'buckets/of/fun'
});

keyStream.on('data', function (key) {
  console.log(key);
});
如果您列出的文件少于1000个,则可以使用一个页面:
// A single page suffices when the listing holds fewer than 1000 keys.
var pageOpts = { prefix: 'smaller/bucket/o/fun' };
client.listPageOfKeys(pageOpts, function (err, page) {
  console.log(page.Contents); // <- Here's your list of files
});
Meekohi提供了一个很好的答案,但是(新)文档指出,NextMarker可能是未定义的。在这种情况下,应使用最后一个键作为标记。
因此,他的代码示例可以更改为:
// Meekohi's answer with the documented NextMarker fallback: when the
// response is truncated and NextMarker is absent (it only appears when a
// Delimiter is requested), the last Key of the page is the next marker.
//
// Remaining fixes applied here:
//  - errors are propagated via cb(err) instead of being ignored;
//  - pages are flattened with concat (push() appended whole arrays).
var allKeys = [];
function listAllKeys(marker, cb) {
  s3.listObjects({ Bucket: s3bucket, Marker: marker }, function (err, data) {
    if (err) return cb(err);
    allKeys = allKeys.concat(data.Contents);
    if (data.IsTruncated) {
      listAllKeys(data.NextMarker || data.Contents[data.Contents.length - 1].Key, cb);
    } else {
      cb(null, allKeys);
    }
  });
}
由于我没有所需的声誉,因此无法评论原始答案。不好意思,顺便说一句。
这是一个古老的问题,我猜自问起以来,AWS JS SDK发生了很大变化。最近,这是另一种方法:
// Page through listObjects using the AWS request object's built-in
// hasNextPage()/nextPage() helpers — no manual marker bookkeeping.
function handlePage(r) {
  //... handle page of contents r.data.Contents

  if (!r.hasNextPage()) {
    // Finished!
    return;
  }
  // There's another page; handle it
  r.nextPage().on('success', handlePage).send();
}

s3.listObjects({ Bucket: 'mybucket', Prefix: 'some-pfx' })
  .on('success', handlePage)
  .on('error', function (r) {
    // Error!
  })
  .send();
我最终围绕ListObjectsV2构建了一个包装器函数,以相同的方式工作,并采用了相同的参数,但递归工作直到IsTruncated = false并返回在回调函数的第二个参数中找到的所有键作为数组
const AWS = require('aws-sdk')
const s3 = new AWS.S3()

// Recursive wrapper around listObjectsV2: accepts the same params object as
// the SDK call, follows NextContinuationToken until IsTruncated is false,
// and returns every Contents entry as an array via cb(null, keys).
//
// Bug fix: the original stored only the *current* page in params.data and
// rebuilt `keys` from it on each recursion, so all keys gathered before the
// previous page were discarded — at most the last two pages were returned.
// The accumulated array is now carried forward through params.data.
function listAllKeys(params, cb) {
  let keys = []
  if (params.data) {
    keys = keys.concat(params.data)
  }
  delete params['data']
  s3.listObjectsV2(params, function (err, data) {
    if (err) {
      cb(err)
    } else if (data.IsTruncated) {
      params['ContinuationToken'] = data.NextContinuationToken
      // Carry everything collected so far, not just this page.
      params['data'] = keys.concat(data.Contents)
      listAllKeys(params, cb)
    } else {
      keys = keys.concat(data.Contents)
      cb(null, keys)
    }
  })
}
如果您只想在S3存储桶中的特定文件夹中获取键列表,那么这将很有用。
基本上,
考虑到我的存储桶中有很多文件夹,前缀为
我只想在
{
  Key: 'prod/2017/05/13/4bf2c675-a417-4c1f-a0b4-22fc45f99207.jpg',
  LastModified: '2017-05-13T00:59:02.000Z',
  ETag: '"630b2sdfsdfs49ef392bcc16c833004f94ae850"',
  Size: 134236366,
  StorageClass: 'STANDARD',
  Owner: { }
}
代码:
var list = [];

// List keys that sort between `start` (exclusive, used as the Marker) and
// `end` in a bucket — e.g. every object for one day's prefix. Matched keys
// accumulate in the module-level `list` and are printed when done.
//
// Fixes over the original:
//  - errors are reported instead of silently ignored;
//  - the stop condition compares against end.length instead of a
//    hard-coded 19, so prefixes of any length work;
//  - truncated responses are followed, so ranges holding more than 1000
//    keys (the per-page MaxKeys) are fully listed.
function listAllKeys(s3bucket, start, end) {
  s3.listObjects({
    Bucket: s3bucket,
    Marker: start,
    MaxKeys: 1000,
  }, function (err, data) {
    if (err) {
      console.log('listAllKeys error:', err);
      return;
    }
    var reachedEnd = false;
    if (data.Contents) {
      for (var i = 0; i < data.Contents.length; i++) {
        var key = data.Contents[i].Key; // See above for the Contents structure
        if (key.substring(0, end.length) != end) {
          list.push(key);
        } else {
          reachedEnd = true;
          break; // break the loop if end arrived
        }
      }
    }
    if (!reachedEnd && data.IsTruncated && list.length > 0) {
      // More pages before `end`: continue from the last key we collected.
      listAllKeys(s3bucket, list[list.length - 1], end);
      return;
    }
    console.log(list);
    console.log('Total - ', list.length);
  });
}

listAllKeys('BucketName', 'prod/2017/05/12/', 'prod/2017/05/13/');
输出:
1 2 3 4 5 6 | [ 'prod/2017/05/12/05/4bf2c675-a417-4c1f-a0b4-22fc45f99207.jpg', 'prod/2017/05/12/05/a36528b9-e071-4b83-a7e6-9b32d6bce6d8.jpg', 'prod/2017/05/12/05/bc4d6d4b-4455-48b3-a548-7a714c489060.jpg', 'prod/2017/05/12/05/f4b8d599-80d0-46fa-a996-e73b8fd0cd6d.jpg', ... 689 more items ] Total - 692 |
使用异步生成器
const { S3 } = require('aws-sdk');
const s3 = new S3();

// Async generator that yields one listObjectsV2 response per page.
// Fix: paginate on a local copy of the options so the caller's `opts`
// object is not mutated by the ContinuationToken bookkeeping.
async function* listAllKeys(opts) {
  const pageOpts = { ...opts };
  do {
    const data = await s3.listObjectsV2(pageOpts).promise();
    pageOpts.ContinuationToken = data.NextContinuationToken;
    yield data;
  } while (pageOpts.ContinuationToken)
}

const opts = {
  Bucket: 'bucket-xyz', /* required */
  // ContinuationToken: 'STRING_VALUE',
  // Delimiter: 'STRING_VALUE',
  // EncodingType: url,
  // FetchOwner: true || false,
  // MaxKeys: 'NUMBER_VALUE',
  // Prefix: 'STRING_VALUE',
  // RequestPayer: requester,
  // StartAfter: 'STRING_VALUE'
};

async function main() {
  // using for of await loop
  for await (const data of listAllKeys(opts)) {
    console.log(data.Contents)
  }

  // or lazy-load
  const keys = listAllKeys(opts);
  console.log(await keys.next()); // {value: {…}, done: false}
  console.log(await keys.next()); // {value: {…}, done: false}
  console.log(await keys.next()); // {value: undefined, done: true}
}
main();

// Making Observable: adapt the generator to an observer with
// {next, error, complete} and return an unsubscribe function that stops
// further paging.
const lister = opts => o => {
  let needMore = true;
  (async () => {
    try {
      // for-await already unwraps {value, done}; the original's
      // `if (data.done) break` could never fire and was removed.
      for await (const data of listAllKeys(opts)) {
        o.next(data);
        if (!needMore) break;
      }
      o.complete();
    } catch (err) {
      // Fix: surface generator failures through the observer instead of
      // leaving an unhandled promise rejection.
      o.error(err);
    }
  })();
  return () => (needMore = false);
}

// Using Rxjs
const { Observable } = require('rxjs');
const { flatMap } = require('rxjs/operators')

function listAll() {
  return Observable.create(lister(opts))
    .pipe(flatMap(v => v.Contents))
    .subscribe(console.log);
}
listAll();

// Using Nodejs EventEmitter
const EventEmitter = require('events');
const _eve = new EventEmitter();
_eve.on('next', console.log);

const stop = lister(opts)({
  next: v => _eve.emit('next', v),
  error: e => _eve.emit('error', e),
  complete: v => _eve.emit('complete', v)
});
使用新的API
// List every object under this dataset's configured s3.Bucket/s3.Key prefix.
// Pages through listObjectsV2 (1000 keys per page) and collects each
// non-empty object as {name, path, mtime, size, sizehr}; the full array is
// delivered via callback(null, results) after the last page, or
// callback(error) on failure.
S3Dataset.prototype.listFiles = function (params, callback) {
    var self = this;
    var options = {};
    for (var attrname in params) { options[attrname] = params[attrname]; }
    var results = [];
    var s3 = self.s3Store.GetInstance();
    function listAllKeys(token, callback) {
        var opt = {
            Bucket: self._options.s3.Bucket,
            Prefix: self._options.s3.Key,
            MaxKeys: 1000
        };
        if (token) opt.ContinuationToken = token;
        s3.listObjectsV2(opt, (error, data) => {
            if (error) {
                // Bug fix: recursive pages invoke listAllKeys as a plain
                // function, so `this` is not the dataset inside the arrow —
                // the original `this.logger.error` would throw there. Use
                // the captured `self` instead.
                if (self.logger) self.logger.error("listFiles error:", error);
                return callback(error);
            } else {
                for (var index in data.Contents) {
                    var bucket = data.Contents[index];
                    if (self.logger) self.logger.debug("listFiles Key: %s LastModified: %s Size: %s", bucket.Key, bucket.LastModified, bucket.Size);
                    // Skip zero-byte entries ("folder" placeholder objects).
                    if (bucket.Size > 0) {
                        var Bucket = self._options.s3.Bucket;
                        var Key = bucket.Key;
                        var components = bucket.Key.split('/');
                        var name = components[components.length - 1];
                        results.push({
                            name: name,
                            path: bucket.Key,
                            mtime: bucket.LastModified,
                            size: bucket.Size,
                            sizehr: formatSizeUnits(bucket.Size)
                        });
                    }
                }
                if (data.IsTruncated) { // truncated page
                    return listAllKeys(data.NextContinuationToken, callback);
                } else {
                    return callback(null, results);
                }
            }
        });
    }
    return listAllKeys.apply(this, ['', callback]);
};
其中 formatSizeUnits 定义如下:
// Format a byte count as a human-readable string with a binary-unit suffix
// (four decimal places for KB and above; integer counts below 1 KB).
//
// Bug fix: 1099511627776 bytes = 1024^4, which is a terabyte, but the
// original labelled that tier ' PB'. The label is corrected to ' TB' and a
// genuine petabyte tier (1024^5) is added above it.
function formatSizeUnits(bytes) {
    if (bytes >= 1125899906842624) { bytes = (bytes / 1125899906842624).toFixed(4) + ' PB'; }
    else if (bytes >= 1099511627776) { bytes = (bytes / 1099511627776).toFixed(4) + ' TB'; }
    else if (bytes >= 1073741824) { bytes = (bytes / 1073741824).toFixed(4) + ' GB'; }
    else if (bytes >= 1048576) { bytes = (bytes / 1048576).toFixed(4) + ' MB'; }
    else if (bytes >= 1024) { bytes = (bytes / 1024).toFixed(4) + ' KB'; }
    else if (bytes > 1) { bytes = bytes + ' bytes'; }
    else if (bytes == 1) { bytes = bytes + ' byte'; }
    else { bytes = '0 byte'; }
    return bytes;
} //formatSizeUnits
尽管@Meekohi的答案在技术上是可行的,但对于NodeJS的AWS开发工具包的S3部分,我还是感到非常头痛。在所有以前遇到诸如
就像是:
// Shell out to `s3cmd ls` and stream its stdout back in the HTTP response.
// The two callbacks accumulate output and record a non-zero exit.
var s3cmd = new cmd_exec(
    's3cmd',
    ['ls', filepath, 's3://' + inputBucket],
    function (me, data) { me.stdout += data.toString(); },
    function (me) { me.exit = 1; }
);

response.send(s3cmd.stdout);
(使用此问题的
这种方法非常有效-包括文件上传等其他问题。
对我而言,最干净的方法是通过从我的节点脚本执行s3cmd来执行此操作(此处的示例是递归删除文件):
// Recursively delete s3://<bucket>/<prefix> by shelling out to s3cmd.
var exec = require('child_process').exec;
var child;

var bucket = "myBucket";
var prefix = "myPrefix"; // this parameter is optional
var command = "s3cmd del -r s3://" + bucket + "/" + prefix;

// maxBuffer is raised to avoid the node "maxBuffer exceeded" process error
// when s3cmd produces a lot of output.
child = exec(command, { maxBuffer: 5000 * 1024 }, function (error, stdout, stderr) {
    console.log('stdout: ' + stdout);
    if (error !== null) {
        console.log('exec error: ' + error);
    }
});