[svn] Add support for extracting CAB archives. Because the CAB archive I was ... (branch: trunk)

Fri, 23 Nov 2007 11:24:58 -0500

author
brett
date
Fri, 23 Nov 2007 11:24:58 -0500
branch
trunk
changeset 35
957b402d4b90
parent 34
a8f875e02c83
child 36
4bf2508d9b9e

[svn] Add support for extracting CAB archives. Because the CAB archive I was
testing on had a single file in it, I found a bunch of bugs related to
extracting one file (as opposed to one directory) here, and squashed those
in the process.

TODO file | annotate | diff | comparison | revisions
scripts/dtrx file | annotate | diff | comparison | revisions
tests/test-onefile.tar.gz file | annotate | diff | comparison | revisions
tests/tests.yml file | annotate | diff | comparison | revisions
--- a/TODO	Thu Nov 22 22:37:40 2007 -0500
+++ b/TODO	Fri Nov 23 11:24:58 2007 -0500
@@ -1,8 +1,6 @@
 * Make sure you only try each extractor once?
 
 Things which I have a use case/anti-use case for:
-* CAB extraction.
-* Support lzma compression (http://tukaani.org/lzma/download)
 * Support pisi packages (http://paketler.pardus.org.tr/pardus-2007/)
 * Steal ideas from <http://martin.ankerl.com/files/e>.
 * Figure out what the deal is with strerror. (done?)
--- a/scripts/dtrx	Thu Nov 22 22:37:40 2007 -0500
+++ b/scripts/dtrx	Fri Nov 23 11:24:58 2007 -0500
@@ -176,8 +176,8 @@
 
     def check_included_archives(self, filenames):
         for filename in filenames:
-            if (ExtractorBuilder.try_by_mimetype(filename)[0] or
-                ExtractorBuilder.try_by_extension(filename)[0]):
+            if (ExtractorBuilder.try_by_mimetype(filename) or
+                ExtractorBuilder.try_by_extension(filename)):
                 self.included_archives.append(filename)
 
     def check_contents(self):
@@ -250,7 +250,12 @@
         self.content_type = ONE_ENTRY_KNOWN
         self.content_name = self.basename()
         output_fd, self.target = tempfile.mkstemp(prefix='.dtrx-', dir='.')
-        self.run_pipes(output_fd)
+        try:
+            self.run_pipes(output_fd)
+        except ExtractorError:
+            os.close(output_fd)
+            os.unlink(self.target)
+            raise
         os.close(output_fd)
         
 
@@ -264,20 +269,6 @@
         self.run_pipes()
         
         
-class ZipExtractor(BaseExtractor):
-    def __init__(self, filename, encoding):
-        BaseExtractor.__init__(self, '/dev/null', None)
-        self.filename = os.path.realpath(filename)
-
-    def get_filenames(self):
-        self.pipe(['zipinfo', '-1', self.filename], "listing")
-        return BaseExtractor.get_filenames(self)
-
-    def extract_archive(self):
-        self.pipe(['unzip', '-q', self.filename])
-        self.run_pipes()
-
-
 class CpioExtractor(BaseExtractor):
     def get_filenames(self):
         self.pipe(['cpio', '-t'], "listing")
@@ -357,13 +348,32 @@
         return os.path.basename(self.filename) + '-metadata.txt'
 
 
-class SevenExtractor(BaseExtractor):
-    border_re = re.compile('^[- ]+$')
-
+class NoPipeExtractor(BaseExtractor):
+    # Some extraction tools won't accept the archive from stdin.  With
+    # these, the piping infrastructure we normally set up generally doesn't
+    # work, at least at first.  We can still use most of it; we just can't
+    # seed self.archive with the archive file.  So instead we seed it with
+    # /dev/null, and specify the filename on the command line as necessary.
+    # This class doesn't do anything by itself; it's just meant to be a
+    # base class for extractors that rely on these dumb tools.
     def __init__(self, filename, encoding):
         BaseExtractor.__init__(self, '/dev/null', None)
         self.filename = os.path.realpath(filename)
 
+
+class ZipExtractor(NoPipeExtractor):
+    def get_filenames(self):
+        self.pipe(['zipinfo', '-1', self.filename], "listing")
+        return BaseExtractor.get_filenames(self)
+
+    def extract_archive(self):
+        self.pipe(['unzip', '-q', self.filename])
+        self.run_pipes()
+
+
+class SevenExtractor(NoPipeExtractor):
+    border_re = re.compile('^[- ]+$')
+
     def get_filenames(self):
         self.pipe(['7z', 'l', self.filename], "listing")
         self.run_pipes()
@@ -384,6 +394,29 @@
         self.run_pipes()
         
 
+class CABExtractor(NoPipeExtractor):
+    border_re = re.compile(r'^[-\+]+$')
+
+    def get_filenames(self):
+        self.pipe(['cabextract', '-l', self.filename], "listing")
+        self.run_pipes()
+        self.archive.seek(0, 0)
+        fn_index = None
+        for line in self.archive:
+            if self.border_re.match(line):
+                break
+        for line in self.archive:
+            try:
+                yield line.split(' | ', 2)[2].rstrip('\n')
+            except IndexError:
+                break
+        self.archive.close()
+
+    def extract_archive(self):
+        self.pipe(['cabextract', '-q', self.filename])
+        self.run_pipes()
+
+
 class BaseHandler(object):
     def __init__(self, extractor, options):
         self.extractor = extractor
@@ -458,15 +491,19 @@
     can_handle = staticmethod(can_handle)
 
     def organize(self):
+        source = os.path.join(self.extractor.target,
+                              os.listdir(self.extractor.target)[0])
+        if os.path.isdir(source):
+            checker = DirectoryChecker
+        else:
+            checker = FilenameChecker
         if self.options.one_entry_policy == EXTRACT_HERE:
             destination = self.extractor.content_name.rstrip('/')
         else:
             destination = self.extractor.basename()
-        self.target = self.extractor.name_checker(destination).check()
+        self.target = checker(destination).check()
         if os.path.isdir(self.extractor.target):
-            os.rename(os.path.join(self.extractor.target,
-                                   os.listdir(self.extractor.target)[0]),
-                      self.target)
+            os.rename(source, self.target)
             os.rmdir(self.extractor.target)
         else:
             os.rename(self.extractor.target, self.target)
@@ -580,7 +617,8 @@
                      'cpio': (CpioExtractor, None),
                      'gem': (GemExtractor, GemMetadataExtractor),
                      'compress': (CompressionExtractor, None),
-                     '7z': (SevenExtractor, None)}
+                     '7z': (SevenExtractor, None),
+                     'cab': (CABExtractor, None)}
 
     mimetype_map = {}
     for mapping in (('tar', 'x-tar'),
@@ -589,7 +627,8 @@
                     ('rpm', 'x-redhat-package-manager', 'x-rpm'),
                     ('cpio', 'x-cpio'),
                     ('gem', 'x-ruby-gem'),
-                    ('7z', 'x-7z-compressed')):
+                    ('7z', 'x-7z-compressed'),
+                    ('cab', 'x-cab')):
         for mimetype in mapping[1:]:
             if '/' not in mimetype:
                 mimetype = 'application/' + mimetype
@@ -601,7 +640,8 @@
                     ('tar', 'POSIX tar archive'),
                     ('zip', 'Zip archive'),
                     ('rpm', 'RPM'),
-                    ('7z', '7-zip archive')):
+                    ('7z', '7-zip archive'),
+                    ('cab', 'Microsoft Cabinet archive')):
         for pattern in mapping[1:]:
             magic_mime_map[re.compile(pattern)] = mapping[0]
     
@@ -620,10 +660,13 @@
                     ('rpm', None, 'rpm'),
                     ('cpio', None, 'cpio'),
                     ('gem', None, 'gem'),
-                    ('compress', None, 'Z', 'gz', 'bz2', 'lzma'),
-                    ('7z', None, '7z')):
+                    ('compress', 'gzip', 'Z', 'gz'),
+                    ('compress', 'bzip2', 'bz2'),
+                    ('compress', 'lzma', 'lzma'),
+                    ('7z', None, '7z'),
+                    ('cab', None, 'cab', 'exe')):
         for extension in mapping[2:]:
-            extension_map[extension] = mapping[:2]
+            extension_map.setdefault(extension, []).append(mapping[:2])
 
     def __init__(self, filename, options):
         self.filename = filename
@@ -639,51 +682,56 @@
 
     def get_extractor(self):
         for func_name in ('mimetype', 'extension', 'magic'):
-            archive_type, encoding = \
-                          getattr(self, 'try_by_' + func_name)(self.filename)
-            logger.debug("%s extractor is %s, %s" %
-                         (func_name, archive_type, encoding))
-            if archive_type is not None:
-                yield self.build_extractor(archive_type, encoding)
+            logger.debug("getting extractors by %s" % (func_name,))
+            extractor_types = \
+                            getattr(self, 'try_by_' + func_name)(self.filename)
+            logger.debug("done getting extractors")
+            for ext_args in extractor_types:
+                logger.debug("trying %s extractor from %s" %
+                             (ext_args, func_name))
+                yield self.build_extractor(*ext_args)
 
     def try_by_mimetype(cls, filename):
         mimetype, encoding = mimetypes.guess_type(filename)
         try:
-            return cls.mimetype_map[mimetype], encoding
+            return [(cls.mimetype_map[mimetype], encoding)]
         except KeyError:
             if encoding:
-                return 'compress', encoding
-        return None, None
+                return [('compress', encoding)]
+        return []
     try_by_mimetype = classmethod(try_by_mimetype)
 
+    def magic_map_matches(cls, output, magic_map):
+        return [result for regexp, result in magic_map.items()
+                if regexp.search(output)]
+    magic_map_matches = classmethod(magic_map_matches)
+        
     def try_by_magic(cls, filename):
         process = subprocess.Popen(['file', '-z', filename],
                                    stdout=subprocess.PIPE)
         status = process.wait()
         if status != 0:
-            return None, None
+            return []
         output = process.stdout.readline()
         process.stdout.close()
         if output.startswith('%s: ' % filename):
             output = output[len(filename) + 2:]
-        results = [None, None]
-        for index, mapping in enumerate((cls.magic_mime_map,
-                                         cls.magic_encoding_map)):
-            for regexp, result in mapping.items():
-                if regexp.search(output):
-                    results[index] = result
-                    break
-        return results
+        mimes = cls.magic_map_matches(output, cls.magic_mime_map)
+        encodings = cls.magic_map_matches(output, cls.magic_encoding_map)
+        if mimes and not encodings:
+            encodings = [None]
+        elif encodings and not mimes:
+            mimes = ['compress']
+        return [(m, e) for m in mimes for e in encodings]
     try_by_magic = classmethod(try_by_magic)
 
     def try_by_extension(cls, filename):
         parts = filename.rsplit('.', 2)[1:]
+        results = []
         while parts:
-            try:
-                return cls.extension_map['.'.join(parts)]
-            except KeyError:
-                del parts[0]
-        return [None, None]
+            results.extend(cls.extension_map.get('.'.join(parts), []))
+            del parts[0]
+        return results
     try_by_extension = classmethod(try_by_extension)
 
 
@@ -715,6 +763,7 @@
                                                extractor.content_name)
         for handler in self.handlers:
             if handler.can_handle(extractor.content_type, self.options):
+                logger.debug("using %s handler" % (handler.__name__,))
                 self.current_handler = handler(extractor, self.options)
                 break
 
@@ -789,7 +838,7 @@
                           help="don't ask how to handle special cases")
         parser.add_option('-m', '--metadata', dest='metadata',
                           action='store_true', default=False,
-                          help="extract metadata from a .deb/.gem/etc.")
+                          help="extract metadata from a .deb/.gem")
         self.options, filenames = parser.parse_args(arguments)
         if not filenames:
             parser.error("you did not list any archives")
Binary file tests/test-onefile.tar.gz has changed
--- a/tests/tests.yml	Thu Nov 22 22:37:40 2007 -0500
+++ b/tests/tests.yml	Fri Nov 23 11:24:58 2007 -0500
@@ -394,3 +394,13 @@
   filenames: /dev/null
   error: true
   grep: ERROR
+
+- name: extract an archive with one file here
+  filenames: test-onefile.tar.gz
+  options: ""
+  grep: "one entry: "
+  input: |
+    h
+    n
+  baseline: |
+    tar -zxf $1

mercurial