Mercurial > dtrx / changeset

--- a/TODO	Fri Oct 19 23:06:53 2007 -0400
+++ b/TODO	Thu Nov 22 22:20:39 2007 -0500
@@ -1,16 +1,7 @@
-We should always extract to a new, temporary directory (except maybe in the
-straight decompression case), and then move that directory based on what we
-actually want.  This has several advantages:
-
-* Much easier to check whether or not the archive is a bomb (O(1) operation)
-* Can find other archives more reliably
-* Can set up a direct pipe from a decompressed to the unarchiver, since we're
-  not interested in reading it multiple times anymore.
-* All this should mean x is faster, too.
+* Make sure you only try each extractor once?

 Things which I have a use case/anti-use case for:
 * CAB extraction.
-* Use file to detect the archive type.
 * Support lzma compression (http://tukaani.org/lzma/download)
 * Support pisi packages (http://paketler.pardus.org.tr/pardus-2007/)
 * Steal ideas from <http://martin.ankerl.com/files/e>.
@@ -23,10 +14,8 @@
 * Better error messages.

 Things I think might be good but can't prove:
-* Take URLs as arguments.
 * Consider having options about whether or not to make sane directories,
   have tarbomb protection, etc.
 * Use zipfile instead of the zip commands.
 * Processing from stdin.
-* Extracting control.tar.gz from deb files.
 * shar support.
--- a/scripts/dtrx	Fri Oct 19 23:06:53 2007 -0400
+++ b/scripts/dtrx	Thu Nov 22 22:20:39 2007 -0500
@@ -62,7 +62,9 @@
 RECURSE_NEVER = 4

 mimetypes.encodings_map.setdefault('.bz2', 'bzip2')
-mimetypes.types_map['.exe'] = 'application/x-msdos-program'
+mimetypes.types_map.setdefault('.gem', 'x-ruby-gem')
+
+logger = logging.getLogger('dtrx-log')

 def run_command(command, description, stdout=None, stderr=None, stdin=None):
     process = subprocess.Popen(command, stdin=stdin, stdout=stdout,
@@ -174,7 +176,7 @@

     def check_included_archives(self, filenames):
         for filename in filenames:
-            if extractor_map.has_key(mimetypes.guess_type(filename)[0]):
+            if ExtractorBuilder.try_by_mimetype(filename)[0]:
                 self.included_archives.append(filename)

     def check_contents(self):
@@ -351,7 +353,6 @@

 class BaseHandler(object):
     def __init__(self, extractor, options):
-        self.logger = logging.getLogger('dtrx-log')
         self.extractor = extractor
         self.options = options
         self.target = None
@@ -389,9 +390,9 @@
                                                topdown=False):
             path_parts = curdir.split(os.sep)
             if path_parts[0] == '.':
-                path_parts.pop(1)
+                del path_parts[1]
             else:
-                path_parts.pop(0)
+                del path_parts[0]
             newdir = os.path.join(*path_parts)
             if not os.path.isdir(newdir):
                 os.makedirs(newdir)
@@ -514,7 +515,9 @@

     def __init__(self, options):
         BasePolicy.__init__(self, options)
-        if options.recursive:
+        if options.show_list:
+            self.permanent_policy = RECURSE_NEVER
+        elif options.recursive:
             self.permanent_policy = RECURSE_ALWAYS

     def prep(self, current_filename, included_archives):
@@ -536,18 +539,6 @@
         return self.current_policy in (RECURSE_ALWAYS, RECURSE_ONCE)


-extractor_map = {'application/x-tar': TarExtractor,
-                 'application/zip': ZipExtractor,
-                 'application/x-msdos-program': ZipExtractor,
-                 'application/x-debian-package': DebExtractor,
-                 'application/x-redhat-package-manager': RPMExtractor,
-                 'application/x-rpm': RPMExtractor,
-                 'application/x-cpio': CpioExtractor,
-                 'application/x-ruby-gem': GemExtractor}
-
-handlers = [FlatHandler, OverwriteHandler, MatchHandler, EmptyHandler,
-            BombHandler]
-
 class ExtractorBuilder(object):
     extractor_map = {'tar': (TarExtractor, None),
                      'zip': (ZipExtractor, None),
@@ -584,6 +575,19 @@
         for pattern in mapping[1:]:
             magic_encoding_map[re.compile(pattern)] = mapping[0]

+    extension_map = {}
+    for mapping in (('tar', 'bzip2', 'tar.bz2'),
+                    ('tar', 'gzip', 'tar.gz', 'tgz'),
+                    ('tar', None, 'tar'),
+                    ('zip', None, 'zip', 'exe'),
+                    ('deb', None, 'deb'),
+                    ('rpm', None, 'rpm'),
+                    ('cpio', None, 'cpio'),
+                    ('gem', None, 'gem'),
+                    ('compress', None, 'gz', 'bz2')):
+        for extension in mapping[2:]:
+            extension_map[extension] = mapping[:2]
+
     def __init__(self, filename, options):
         self.filename = filename
         self.options = options
@@ -597,38 +601,117 @@
         return extractor(self.filename, encoding)

     def get_extractor(self):
-        for func_name in ('mimetype', 'magic'):
-            archive_type, encoding = getattr(self, 'try_by_' + func_name)()
+        for func_name in ('mimetype', 'extension', 'magic'):
+            archive_type, encoding = \
+                          getattr(self, 'try_by_' + func_name)(self.filename)
             if archive_type is not None:
                 yield self.build_extractor(archive_type, encoding)

-    def try_by_mimetype(self):
-        mimetype, encoding = mimetypes.guess_type(self.filename)
+    def try_by_mimetype(cls, filename):
+        mimetype, encoding = mimetypes.guess_type(filename)
         try:
-            return self.mimetype_map[mimetype], encoding
+            return cls.mimetype_map[mimetype], encoding
         except KeyError:
             if encoding:
                 return 'compress', encoding
         return None, None
+    try_by_mimetype = classmethod(try_by_mimetype)

-    def try_by_magic(self):
-        process = subprocess.Popen(['file', '-z', self.filename],
+    def try_by_magic(cls, filename):
+        process = subprocess.Popen(['file', '-z', filename],
                                    stdout=subprocess.PIPE)
         status = process.wait()
         if status != 0:
             return None, None
         output = process.stdout.readline()
         process.stdout.close()
-        if output.startswith('%s: ' % self.filename):
-            output = output[len(self.filename) + 2:]
+        if output.startswith('%s: ' % filename):
+            output = output[len(filename) + 2:]
         results = [None, None]
-        for index, mapping in enumerate((self.magic_mime_map,
-                                         self.magic_encoding_map)):
+        for index, mapping in enumerate((cls.magic_mime_map,
+                                         cls.magic_encoding_map)):
             for regexp, result in mapping.items():
                 if regexp.search(output):
                     results[index] = result
                     break
         return results
+    try_by_magic = classmethod(try_by_magic)
+
+    def try_by_extension(cls, filename):
+        parts = filename.rsplit('.', 2)[1:]
+        while parts:
+            try:
+                return cls.extension_map['.'.join(parts)]
+            except KeyError:
+                del parts[0]
+        return [None, None]
+    try_by_extension = classmethod(try_by_extension)
+
+
+class BaseAction(object):
+    def __init__(self, options, filenames):
+        self.options = options
+        self.filenames = filenames
+        self.target = None
+
+    def report(self, function, *args):
+        try:
+            error = function(*args)
+        except (ExtractorError, IOError, OSError), exception:
+            error = str(exception)
+            logger.debug(traceback.format_exception(*sys.exc_info()))
+        if error:
+            logger.error("%s: %s", self.current_filename, error)
+            return False
+        return True
+
+
+class ExtractionAction(BaseAction):
+    handlers = [FlatHandler, OverwriteHandler, MatchHandler, EmptyHandler,
+                BombHandler]
+
+    def get_handler(self, extractor):
+        if extractor.content_type == ONE_ENTRY:
+            self.options.one_entry_policy.prep(self.current_filename,
+                                               extractor.content_name)
+        for handler in self.handlers:
+            if handler.can_handle(extractor.content_type, self.options):
+                self.current_handler = handler(extractor, self.options)
+                break
+
+    def run(self, filename, extractor):
+        self.current_filename = filename
+        success = (self.report(extractor.extract) and
+                   self.report(self.get_handler, extractor) and
+                   self.report(self.current_handler.handle))
+        if success:
+            self.target = self.current_handler.target
+        return success
+
+
+class ListAction(BaseAction):
+    def __init__(self, options, filenames):
+        BaseAction.__init__(self, options, filenames)
+
+    def get_list(self, extractor):
+        # Note: All the filenames are collected up front because, if we
+        # run into trouble partway through the archive, we'll try another
+        # extractor.  So before we display anything we have to be sure this
+        # one is successful.  We may not need to be quite this conservative,
+        # but this is the easy way out for now.
+        self.filelist = list(extractor.get_filenames())
+
+    def show_list(self, filename):
+        if len(self.filenames) != 1:
+            if filename != self.filenames[0]:
+                print
+            print "%s:" % (filename,)
+        print '\n'.join(self.filelist)
+
+    def run(self, filename, extractor):
+        self.current_filename = filename
+        return (self.report(self.get_list, extractor) and
+                self.report(self.show_list, filename))


 class ExtractorApplication(object):
@@ -676,98 +759,41 @@
         self.archives = {os.path.realpath(os.curdir): filenames}

     def setup_logger(self):
-        self.logger = logging.getLogger('dtrx-log')
         handler = logging.StreamHandler()
         # WARNING is the default.
         handler.setLevel(10 * (self.options.quiet - self.options.verbose))
         formatter = logging.Formatter("dtrx: %(levelname)s: %(message)s")
         handler.setFormatter(formatter)
-        self.logger.addHandler(handler)
+        logger.addHandler(handler)

-    def get_handler(self):
-        for var_name in ('type', 'name'):
-            exec('content_%s = self.current_extractor.content_%s' %
-                 (var_name, var_name))
-        if content_type == ONE_ENTRY:
-            self.options.one_entry_policy.prep(self.current_filename,
-                                               content_name)
-        for handler in handlers:
-            if handler.can_handle(content_type, self.options):
-                self.current_handler = handler(self.current_extractor,
-                                               self.options)
-                break
-
-    def recurse(self):
-        archives = self.current_extractor.included_archives
-        self.options.recursion_policy.prep(self.current_filename, archives)
+    def recurse(self, filename, extractor, action):
+        archives = extractor.included_archives
+        self.options.recursion_policy.prep(filename, archives)
         if self.options.recursion_policy.ok_to_recurse():
             for filename in archives:
                 tail_path, basename = os.path.split(filename)
                 directory = os.path.join(self.current_directory,
-                                         self.current_handler.target, tail_path)
+                                         action.target, tail_path)
                 self.archives.setdefault(directory, []).append(basename)

-    def report(self, function, *args):
-        try:
-            error = function(*args)
-        except (ExtractorError, IOError, OSError), exception:
-            error = str(exception)
-            self.logger.debug(traceback.format_exception(*sys.exc_info()))
-        if error:
-            self.logger.error("%s: %s", self.current_filename, error)
-            return False
-        return True
-
-    def record_status(self, success):
-        if success:
-            self.successes.append(self.current_filename)
-        else:
-            self.failures.append(self.current_filename)
-
-    def extract(self):
-        success = (self.report(self.current_extractor.extract) and
-                   self.report(self.get_handler) and
-                   self.report(self.current_handler.handle))
-        if success:
-            self.recurse()
-        return success
-
-    def show_contents(self):
-        for filename in self.current_extractor.get_filenames():
-            print filename
-
-    def make_list(self):
-        if len(self.archives.values()[0]) == 1:
-            def show_list():
-                return self.report(self.show_contents)
-        else:
-            def show_list():
-                if self.current_filename == self.filenames[0]:
-                    print "%s:\n" % (self.current_filename,),
-                else:
-                    print "\n%s:\n" % (self.current_filename,),
-                return self.report(self.show_contents)
-        return show_list
-
     def run(self):
         if self.options.show_list:
-            action_function = self.make_list()
+            action = ListAction
         else:
-            action_function = self.extract
+            action = ExtractionAction
+        action = action(self.options, self.archives.values()[0])
         while self.archives:
             self.current_directory, self.filenames = self.archives.popitem()
             os.chdir(self.current_directory)
             for filename in self.filenames:
-                self.current_filename = filename
-                builder = ExtractorBuilder(self.current_filename, self.options)
+                builder = ExtractorBuilder(filename, self.options)
                 for extractor in builder.get_extractor():
-                    self.current_extractor = extractor
-                    success = action_function()
-                    if success:
-                        self.record_status(success)
+                    if action.run(filename, extractor):
+                        self.successes.append(filename)
+                        self.recurse(filename, extractor, action)
                         break
                 else:
-                    self.record_status(success=False)
+                    self.failures.append(filename)
             self.options.one_entry_policy.permanent_policy = EXTRACT_WRAP
         if self.failures:
             return 1
--- a/tests/tests.yml	Fri Oct 19 23:06:53 2007 -0400
+++ b/tests/tests.yml	Thu Nov 22 22:20:39 2007 -0500
@@ -36,6 +36,11 @@
     cd test-1.23
     tar -xOf ../$1 data.tar.gz | tar -zx

+- name: basic .7z
+  filenames: test-1.23.7z
+  baseline: |
+    7z x $1
+
 - name: .deb metadata
   filenames: test-1.23_all.deb
   options: --metadata
TODO		file \| annotate \| diff \| comparison \| revisions
scripts/dtrx		file \| annotate \| diff \| comparison \| revisions
tests/tests.yml		file \| annotate \| diff \| comparison \| revisions