scripts/dtrx

branch
trunk
changeset 35
957b402d4b90
parent 34
a8f875e02c83
child 36
4bf2508d9b9e
equal deleted inserted replaced
34:a8f875e02c83 35:957b402d4b90
def prepare(self):
    """Do nothing; this implementation is an intentional no-op."""
    return None
176 176
def check_included_archives(self, filenames):
    """Record which of the given names look like archives themselves.

    A name is appended to ``self.included_archives`` when
    ``ExtractorBuilder`` returns a non-empty result for it, either by
    MIME type or (only if that found nothing) by file extension.
    """
    for candidate in filenames:
        recognized = ExtractorBuilder.try_by_mimetype(candidate)
        if not recognized:
            # The extension lookup is only consulted as a fallback.
            recognized = ExtractorBuilder.try_by_extension(candidate)
        if not recognized:
            continue
        self.included_archives.append(candidate)
182 182
183 def check_contents(self): 183 def check_contents(self):
184 filenames = os.listdir('.') 184 filenames = os.listdir('.')
185 if not filenames: 185 if not filenames:
248 248
def extract(self):
    """Run the pipeline with its output sent to a new temporary file.

    Sets ``content_type`` to ``ONE_ENTRY_KNOWN`` and ``content_name`` to
    ``self.basename()``, creates a ``.dtrx-`` prefixed temporary file in
    the current directory, stores its path in ``self.target`` and hands
    the open descriptor to ``run_pipes``.

    Raises:
        ExtractorError: re-raised from ``run_pipes`` after the temporary
            file has been removed.
    """
    self.content_type = ONE_ENTRY_KNOWN
    self.content_name = self.basename()
    output_fd, self.target = tempfile.mkstemp(prefix='.dtrx-', dir='.')
    try:
        try:
            self.run_pipes(output_fd)
        finally:
            # Close the descriptor on every exit path, not just on
            # success or ExtractorError, so that an unexpected exception
            # cannot leak it.
            os.close(output_fd)
    except ExtractorError:
        # The descriptor is already closed here; drop the partial output
        # before letting the caller see the failure.
        os.unlink(self.target)
        raise
255 260
256 261
257 class TarExtractor(BaseExtractor): 262 class TarExtractor(BaseExtractor):
258 def get_filenames(self): 263 def get_filenames(self):
262 def extract_archive(self): 267 def extract_archive(self):
263 self.pipe(['tar', '-x']) 268 self.pipe(['tar', '-x'])
264 self.run_pipes() 269 self.run_pipes()
265 270
266 271
267 class ZipExtractor(BaseExtractor):
268 def __init__(self, filename, encoding):
269 BaseExtractor.__init__(self, '/dev/null', None)
270 self.filename = os.path.realpath(filename)
271
272 def get_filenames(self):
273 self.pipe(['zipinfo', '-1', self.filename], "listing")
274 return BaseExtractor.get_filenames(self)
275
276 def extract_archive(self):
277 self.pipe(['unzip', '-q', self.filename])
278 self.run_pipes()
279
280
281 class CpioExtractor(BaseExtractor): 272 class CpioExtractor(BaseExtractor):
282 def get_filenames(self): 273 def get_filenames(self):
283 self.pipe(['cpio', '-t'], "listing") 274 self.pipe(['cpio', '-t'], "listing")
284 return BaseExtractor.get_filenames(self) 275 return BaseExtractor.get_filenames(self)
285 276
355 346
def basename(self):
    """Return the final path component of ``self.filename`` with
    ``-metadata.txt`` appended."""
    leaf = os.path.basename(self.filename)
    return leaf + '-metadata.txt'
358 349
359 350
class NoPipeExtractor(BaseExtractor):
    """Base class for extractors whose tools cannot read from stdin.

    Some extraction tools refuse to take the archive on standard input,
    so the usual piping setup cannot be seeded with the archive file.
    Most of that infrastructure is still usable: the pipeline is seeded
    with /dev/null instead, and subclasses name the archive on the
    command line wherever the tool needs it.  This class does nothing by
    itself; it only exists to be inherited from.
    """

    def __init__(self, filename, encoding):
        # The base class is given /dev/null and no encoding; the encoding
        # argument itself is not used in this method.
        BaseExtractor.__init__(self, '/dev/null', None)
        # Keep the fully resolved path so it can be passed on a command line.
        self.filename = os.path.realpath(filename)
362
363
class ZipExtractor(NoPipeExtractor):
    """Zip extractor that drives ``zipinfo`` and ``unzip`` by filename."""

    def get_filenames(self):
        """Queue a ``zipinfo -1`` listing and defer to the base class."""
        listing_command = ['zipinfo', '-1', self.filename]
        self.pipe(listing_command, "listing")
        return BaseExtractor.get_filenames(self)

    def extract_archive(self):
        """Queue ``unzip -q`` on the archive and run the pipeline."""
        unzip_command = ['unzip', '-q', self.filename]
        self.pipe(unzip_command)
        self.run_pipes()
372
373
374 class SevenExtractor(NoPipeExtractor):
375 border_re = re.compile('^[- ]+$')
366 376
367 def get_filenames(self): 377 def get_filenames(self):
368 self.pipe(['7z', 'l', self.filename], "listing") 378 self.pipe(['7z', 'l', self.filename], "listing")
369 self.run_pipes() 379 self.run_pipes()
370 self.archive.seek(0, 0) 380 self.archive.seek(0, 0)
381 391
382 def extract_archive(self): 392 def extract_archive(self):
383 self.pipe(['7z', 'x', self.filename]) 393 self.pipe(['7z', 'x', self.filename])
384 self.run_pipes() 394 self.run_pipes()
385 395
396
class CABExtractor(NoPipeExtractor):
    """Cabinet (``cab``) extractor that drives ``cabextract`` by filename."""

    # A line made up solely of '-' and '+' characters; the first such line
    # in the listing is treated as the end of its header.
    border_re = re.compile(r'^[-\+]+$')

    def get_filenames(self):
        """Yield the names found in the ``cabextract -l`` listing.

        Lines up to and including the first border line are skipped.
        After that, the third ' | '-separated field of each line is
        yielded with its trailing newline removed.  Iteration stops at
        the first line that has fewer than three such fields, and the
        listing is then closed.
        """
        self.pipe(['cabextract', '-l', self.filename], "listing")
        self.run_pipes()
        self.archive.seek(0, 0)
        # Consume the header; this shares the iterator with the loop below,
        # which therefore resumes on the line after the border.
        for line in self.archive:
            if self.border_re.match(line):
                break
        for line in self.archive:
            try:
                yield line.split(' | ', 2)[2].rstrip('\n')
            except IndexError:
                # Not a three-column row any more: the table has ended.
                break
        self.archive.close()

    def extract_archive(self):
        """Queue ``cabextract -q`` on the archive and run the pipeline."""
        self.pipe(['cabextract', '-q', self.filename])
        self.run_pipes()
418
386 419
387 class BaseHandler(object): 420 class BaseHandler(object):
388 def __init__(self, extractor, options): 421 def __init__(self, extractor, options):
389 self.extractor = extractor 422 self.extractor = extractor
390 self.options = options 423 self.options = options
456 ((contents == ONE_ENTRY) and 489 ((contents == ONE_ENTRY) and
457 options.one_entry_policy.ok_for_match())) 490 options.one_entry_policy.ok_for_match()))
458 can_handle = staticmethod(can_handle) 491 can_handle = staticmethod(can_handle)
459 492
def organize(self):
    """Move the extracted entry to its final, collision-checked name.

    When ``self.extractor.target`` is a directory, its first listed
    entry is renamed to the destination and the directory is removed.
    When it is a plain file, the file itself is renamed.  The destination
    is ``content_name`` (minus trailing slashes) under the EXTRACT_HERE
    policy and ``self.extractor.basename()`` otherwise; it is passed
    through ``DirectoryChecker`` or ``FilenameChecker`` depending on
    whether the thing being moved is a directory.
    """
    extracted_into_dir = os.path.isdir(self.extractor.target)
    if extracted_into_dir:
        source = os.path.join(self.extractor.target,
                              os.listdir(self.extractor.target)[0])
    else:
        # Listing a plain file would raise, so only look inside the
        # target when it really is a directory.
        source = self.extractor.target
    if os.path.isdir(source):
        checker = DirectoryChecker
    else:
        checker = FilenameChecker
    if self.options.one_entry_policy == EXTRACT_HERE:
        destination = self.extractor.content_name.rstrip('/')
    else:
        destination = self.extractor.basename()
    self.target = checker(destination).check()
    os.rename(source, self.target)
    if extracted_into_dir:
        # The single entry has been moved out; remove the emptied holder.
        os.rmdir(self.extractor.target)
473 510
474 511
578 'deb': (DebExtractor, DebMetadataExtractor), 615 'deb': (DebExtractor, DebMetadataExtractor),
579 'rpm': (RPMExtractor, None), 616 'rpm': (RPMExtractor, None),
580 'cpio': (CpioExtractor, None), 617 'cpio': (CpioExtractor, None),
581 'gem': (GemExtractor, GemMetadataExtractor), 618 'gem': (GemExtractor, GemMetadataExtractor),
582 'compress': (CompressionExtractor, None), 619 'compress': (CompressionExtractor, None),
583 '7z': (SevenExtractor, None)} 620 '7z': (SevenExtractor, None),
621 'cab': (CABExtractor, None)}
584 622
# Lookup table from a full MIME type string to an archive-type key.
# Each row below is (archive-type key, MIME type, [more MIME types...]);
# a MIME type written without a '/' is taken to be under 'application/'.
mimetype_map = {}
for mapping in (
        ('tar', 'x-tar'),
        ('zip', 'x-msdos-program', 'zip'),
        ('deb', 'x-debian-package'),
        ('rpm', 'x-redhat-package-manager', 'x-rpm'),
        ('cpio', 'x-cpio'),
        ('gem', 'x-ruby-gem'),
        ('7z', 'x-7z-compressed'),
        ('cab', 'x-cab')):
    for mimetype in mapping[1:]:
        if not mimetype.count('/'):
            mimetype = 'application/' + mimetype
        mimetype_map[mimetype] = mapping[0]
597 636
599 for mapping in (('deb', 'Debian binary package'), 638 for mapping in (('deb', 'Debian binary package'),
600 ('cpio', 'cpio archive'), 639 ('cpio', 'cpio archive'),
601 ('tar', 'POSIX tar archive'), 640 ('tar', 'POSIX tar archive'),
602 ('zip', 'Zip archive'), 641 ('zip', 'Zip archive'),
603 ('rpm', 'RPM'), 642 ('rpm', 'RPM'),
604 ('7z', '7-zip archive')): 643 ('7z', '7-zip archive'),
644 ('cab', 'Microsoft Cabinet archive')):
605 for pattern in mapping[1:]: 645 for pattern in mapping[1:]:
606 magic_mime_map[re.compile(pattern)] = mapping[0] 646 magic_mime_map[re.compile(pattern)] = mapping[0]
607 647
608 magic_encoding_map = {} 648 magic_encoding_map = {}
609 for mapping in (('bzip2', 'bzip2 compressed'), 649 for mapping in (('bzip2', 'bzip2 compressed'),
618 ('zip', None, 'zip', 'exe'), 658 ('zip', None, 'zip', 'exe'),
619 ('deb', None, 'deb'), 659 ('deb', None, 'deb'),
620 ('rpm', None, 'rpm'), 660 ('rpm', None, 'rpm'),
621 ('cpio', None, 'cpio'), 661 ('cpio', None, 'cpio'),
622 ('gem', None, 'gem'), 662 ('gem', None, 'gem'),
623 ('compress', None, 'Z', 'gz', 'bz2', 'lzma'), 663 ('compress', 'gzip', 'Z', 'gz'),
624 ('7z', None, '7z')): 664 ('compress', 'bzip2', 'bz2'),
665 ('compress', 'lzma', 'lzma'),
666 ('7z', None, '7z'),
667 ('cab', None, 'cab', 'exe')):
625 for extension in mapping[2:]: 668 for extension in mapping[2:]:
626 extension_map[extension] = mapping[:2] 669 extension_map.setdefault(extension, []).append(mapping[:2])
627 670
def __init__(self, filename, options):
    """Store the given filename and options on the instance."""
    self.filename, self.options = filename, options
631 674
637 extractor = extractors[0] 680 extractor = extractors[0]
638 return extractor(self.filename, encoding) 681 return extractor(self.filename, encoding)
639 682
def get_extractor(self):
    """Yield an extractor for every candidate each detection method finds.

    The ``try_by_mimetype``, ``try_by_extension`` and ``try_by_magic``
    methods are consulted in that order.  Each returns a sequence of
    argument tuples, and every tuple is unpacked into
    ``build_extractor`` to produce one yielded extractor.
    """
    for strategy in ('mimetype', 'extension', 'magic'):
        logger.debug("getting extractors by %s" % (strategy,))
        detect = getattr(self, 'try_by_' + strategy)
        candidates = detect(self.filename)
        logger.debug("done getting extractors")
        for builder_args in candidates:
            logger.debug("trying %s extractor from %s" %
                         (builder_args, strategy))
            yield self.build_extractor(*builder_args)
648 693
def try_by_mimetype(cls, filename):
    """Return candidate (archive type, encoding) pairs from the MIME guess.

    The result is a one-element list when the guessed MIME type is in
    ``cls.mimetype_map``, a single ``('compress', encoding)`` pair when
    only an encoding was guessed, and an empty list otherwise.
    """
    guessed_type, guessed_encoding = mimetypes.guess_type(filename)
    if guessed_type in cls.mimetype_map:
        return [(cls.mimetype_map[guessed_type], guessed_encoding)]
    if guessed_encoding:
        return [('compress', guessed_encoding)]
    return []
try_by_mimetype = classmethod(try_by_mimetype)
658 703
def magic_map_matches(cls, output, magic_map):
    """Return the values of ``magic_map`` whose compiled-regexp key is
    found somewhere in ``output``, in the map's iteration order."""
    hits = []
    for pattern, label in magic_map.items():
        if pattern.search(output):
            hits.append(label)
    return hits
magic_map_matches = classmethod(magic_map_matches)
708
def try_by_magic(cls, filename):
    """Return candidate (archive type, encoding) pairs from ``file -z``.

    Runs ``file -z`` on ``filename`` and matches the first line of its
    output, with any leading ``"<filename>: "`` removed, against
    ``cls.magic_mime_map`` and ``cls.magic_encoding_map``.  The result is
    every combination of matched type and matched encoding.  If only
    types match, the encoding is ``None``; if only encodings match, the
    type is ``'compress'``.  An empty list is returned when the command
    exits with a non-zero status or nothing matches.
    """
    process = subprocess.Popen(['file', '-z', filename],
                               stdout=subprocess.PIPE)
    # communicate() reads the pipe to EOF, closes it and then reaps the
    # child.  Calling wait() first can block forever if the child fills
    # the pipe, and would leave the pipe open on the failure path.
    stdout_data = process.communicate()[0]
    if process.returncode != 0:
        return []
    # Keep only the first line, newline included, as readline() would.
    newline_at = stdout_data.find('\n')
    if newline_at == -1:
        output = stdout_data
    else:
        output = stdout_data[:newline_at + 1]
    if output.startswith('%s: ' % filename):
        output = output[len(filename) + 2:]
    mimes = cls.magic_map_matches(output, cls.magic_mime_map)
    encodings = cls.magic_map_matches(output, cls.magic_encoding_map)
    if mimes and not encodings:
        encodings = [None]
    elif encodings and not mimes:
        mimes = ['compress']
    return [(m, e) for m in mimes for e in encodings]
try_by_magic = classmethod(try_by_magic)
678 727
def try_by_extension(cls, filename):
    """Return every ``cls.extension_map`` entry matching the filename.

    Only the last two dot-separated suffixes are considered.  The
    two-part suffix (for example ``tar.gz``) is looked up first, then the
    final suffix alone, and all entries found are concatenated in that
    order.  A name with no dot yields an empty list.
    """
    suffixes = filename.rsplit('.', 2)[1:]
    found = []
    for start in range(len(suffixes)):
        key = '.'.join(suffixes[start:])
        found.extend(cls.extension_map.get(key, []))
    return found
try_by_extension = classmethod(try_by_extension)
688 736
689 737
690 class BaseAction(object): 738 class BaseAction(object):
691 def __init__(self, options, filenames): 739 def __init__(self, options, filenames):
713 if extractor.content_type == ONE_ENTRY: 761 if extractor.content_type == ONE_ENTRY:
714 self.options.one_entry_policy.prep(self.current_filename, 762 self.options.one_entry_policy.prep(self.current_filename,
715 extractor.content_name) 763 extractor.content_name)
716 for handler in self.handlers: 764 for handler in self.handlers:
717 if handler.can_handle(extractor.content_type, self.options): 765 if handler.can_handle(extractor.content_type, self.options):
766 logger.debug("using %s handler" % (handler.__name__,))
718 self.current_handler = handler(extractor, self.options) 767 self.current_handler = handler(extractor, self.options)
719 break 768 break
720 769
721 def run(self, filename, extractor): 770 def run(self, filename, extractor):
722 self.current_filename = filename 771 self.current_filename = filename
787 parser.add_option('-n', '--noninteractive', dest='batch', 836 parser.add_option('-n', '--noninteractive', dest='batch',
788 action='store_true', default=False, 837 action='store_true', default=False,
789 help="don't ask how to handle special cases") 838 help="don't ask how to handle special cases")
790 parser.add_option('-m', '--metadata', dest='metadata', 839 parser.add_option('-m', '--metadata', dest='metadata',
791 action='store_true', default=False, 840 action='store_true', default=False,
792 help="extract metadata from a .deb/.gem/etc.") 841 help="extract metadata from a .deb/.gem")
793 self.options, filenames = parser.parse_args(arguments) 842 self.options, filenames = parser.parse_args(arguments)
794 if not filenames: 843 if not filenames:
795 parser.error("you did not list any archives") 844 parser.error("you did not list any archives")
796 self.options.one_entry_policy = OneEntryPolicy(self.options) 845 self.options.one_entry_policy = OneEntryPolicy(self.options)
797 self.options.recursion_policy = RecursionPolicy(self.options) 846 self.options.recursion_policy = RecursionPolicy(self.options)

mercurial