 lib/bb/cache.py      |  12 ----
 lib/bb/codeparser.py | 143 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 109 insertions, 46 deletions
diff --git a/lib/bb/cache.py b/lib/bb/cache.py
index c7f3b7ab7..f892d7dc3 100644
--- a/lib/bb/cache.py
+++ b/lib/bb/cache.py
@@ -764,16 +764,6 @@ class MultiProcessCache(object):
 
         self.cachedata = data
 
-    def internSet(self, items):
-        new = set()
-        for i in items:
-            new.add(intern(i))
-        return new
-
-    def compress_keys(self, data):
-        # Override in subclasses if desired
-        return
-
     def create_cachedata(self):
         data = [{}]
         return data
@@ -833,8 +823,6 @@ class MultiProcessCache(object):
                 self.merge_data(extradata, data)
                 os.unlink(f)
 
-        self.compress_keys(data)
-
         with open(self.cachefile, "wb") as f:
             p = pickle.Pickler(f, -1)
             p.dump([data, self.__class__.CACHE_VERSION])
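The internSet()/compress_keys() machinery removed above existed because pickle does not preserve string interning: equal strings that are distinct objects are written out in full and come back as separate copies. A minimal standalone sketch of the effect (Python 3 spelling; illustrative, not BitBake code):

    import pickle
    from sys import intern   # intern() was a builtin on the Python 2 this patch targets

    # Build equal strings at runtime so CPython cannot constant-fold and
    # intern them for us; these model duplicate keys from separate parses.
    a = "".join(["do_", "compile"])
    b = "".join(["do_c", "ompile"])
    assert a == b and a is not b

    # pickle memoises by object identity, so distinct-but-equal strings
    # are serialised twice and load back as two separate objects.
    restored = pickle.loads(pickle.dumps([a, b], -1))
    print(restored[0] is restored[1])    # False: memory is duplicated

    # Re-interning after load (what compress_keys() did) collapses them.
    collapsed = [intern(s) for s in restored]
    print(collapsed[0] is collapsed[1])  # True: one shared object

The codeparser.py changes below make this post-load hook unnecessary by deduplicating at object-creation time instead.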
diff --git a/lib/bb/codeparser.py b/lib/bb/codeparser.py
index 2e8de12f3..8b8f91a76 100644
--- a/lib/bb/codeparser.py
+++ b/lib/bb/codeparser.py
@@ -33,9 +33,82 @@ def check_indent(codestr):
 
     return codestr
 
+# Basically pickle, in python 2.7.3 at least, does badly with data duplication
+# upon pickling and unpickling. Combine this with duplicate objects and things
+# are a mess.
+#
+# When the sets are originally created, python calls intern() on the set keys
+# which significantly improves memory usage. Sadly the pickle/unpickle process
+# doesn't call intern() on the keys and results in the same strings being duplicated
+# in memory. This also means pickle will save the same string multiple times in
+# the cache file.
+#
+# By having shell and python cacheline objects with setstate/getstate, we force
+# the object creation through our own routine where we can call intern (via internSet).
+#
+# We also use hashable frozensets and ensure we use references to these so that
+# duplicates can be removed, both in memory and in the resulting pickled data.
+#
+# By playing these games, the size of the cache file shrinks dramatically
+# meaning faster load times and the reloaded cache files also consume much less
+# memory. Smaller cache files, faster load times and lower memory usage is good.
+#
+# A custom getstate/setstate using tuples is actually worth 15% cachesize by
+# avoiding duplication of the attribute names!
+
+class SetCache(object):
+    def __init__(self):
+        self.setcache = {}
+
+    def internSet(self, items):
+
+        new = []
+        for i in items:
+            new.append(intern(i))
+        s = frozenset(new)
+        if hash(s) in self.setcache:
+            return self.setcache[hash(s)]
+        self.setcache[hash(s)] = s
+        return s
+
+codecache = SetCache()
+
+class pythonCacheLine(object):
+    def __init__(self, refs, execs, contains):
+        self.refs = codecache.internSet(refs)
+        self.execs = codecache.internSet(execs)
+        self.contains = {}
+        for c in contains:
+            self.contains[c] = codecache.internSet(contains[c])
+
+    def __getstate__(self):
+        return (self.refs, self.execs, self.contains)
+
+    def __setstate__(self, state):
+        (refs, execs, contains) = state
+        self.__init__(refs, execs, contains)
+    def __hash__(self):
+        l = (hash(self.refs), hash(self.execs))
+        for c in sorted(self.contains.keys()):
+            l = l + (c, hash(self.contains[c]))
+        return hash(l)
+
+class shellCacheLine(object):
+    def __init__(self, execs):
+        self.execs = codecache.internSet(execs)
+
+    def __getstate__(self):
+        return (self.execs)
+
+    def __setstate__(self, state):
+        (execs) = state
+        self.__init__(execs)
+    def __hash__(self):
+        return hash(self.execs)
+
 class CodeParserCache(MultiProcessCache):
     cache_file_name = "bb_codeparser.dat"
-    CACHE_VERSION = 6
+    CACHE_VERSION = 7
 
     def __init__(self):
         MultiProcessCache.__init__(self)
@@ -44,6 +117,27 @@ class CodeParserCache(MultiProcessCache):
         self.pythoncacheextras = self.cachedata_extras[0]
         self.shellcacheextras = self.cachedata_extras[1]
 
+        # To avoid duplication in the codeparser cache, keep
+        # a lookup of hashes of objects we already have
+        self.pythoncachelines = {}
+        self.shellcachelines = {}
+
+    def newPythonCacheLine(self, refs, execs, contains):
+        cacheline = pythonCacheLine(refs, execs, contains)
+        h = hash(cacheline)
+        if h in self.pythoncachelines:
+            return self.pythoncachelines[h]
+        self.pythoncachelines[h] = cacheline
+        return cacheline
+
+    def newShellCacheLine(self, execs):
+        cacheline = shellCacheLine(execs)
+        h = hash(cacheline)
+        if h in self.shellcachelines:
+            return self.shellcachelines[h]
+        self.shellcachelines[h] = cacheline
+        return cacheline
+
     def init_cache(self, d):
         MultiProcessCache.init_cache(self, d)
 
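The point of routing every set through codecache and every cache line through newPythonCacheLine()/newShellCacheLine() is that equal data collapses to one shared object, and pickle stores a shared object once, then emits cheap memo references to it. A rough standalone sketch of the size effect (made-up set contents, not from a real parse):

    import pickle

    # One frozenset shared by many cache lines, versus a fresh equal
    # frozenset per line (what happens without the dedup lookup).
    shared = frozenset(["oe_runmake", "install"])
    deduped    = [("line%d" % i, shared) for i in range(1000)]
    duplicated = [("line%d" % i, frozenset(["oe_runmake", "install"]))
                  for i in range(1000)]

    # The shared object is serialised once plus memo references; the
    # duplicates are each serialised in full and load back as 1000 objects.
    print(len(pickle.dumps(deduped, -1)))     # noticeably smaller
    print(len(pickle.dumps(duplicated, -1)))  # larger on disk, and in memory after load

The dedup lookups key on hash(cacheline), so equal lines seen later reuse the object stored for the first occurrence.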
@@ -51,25 +145,6 @@ class CodeParserCache(MultiProcessCache):
         self.pythoncache = self.cachedata[0]
         self.shellcache = self.cachedata[1]
 
-    def compress_keys(self, data):
-        # When the dicts are originally created, python calls intern() on the set keys
-        # which significantly improves memory usage. Sadly the pickle/unpickle process
-        # doesn't call intern() on the keys and results in the same strings being duplicated
-        # in memory. This also means pickle will save the same string multiple times in
-        # the cache file. By interning the data here, the cache file shrinks dramatically
-        # meaning faster load times and the reloaded cache files also consume much less
-        # memory. This is worth any performance hit from this loops and the use of the
-        # intern() data storage.
-        # Python 3.x may behave better in this area
-        for h in data[0]:
-            data[0][h]["refs"] = self.internSet(data[0][h]["refs"])
-            data[0][h]["execs"] = self.internSet(data[0][h]["execs"])
-            for k in data[0][h]["contains"]:
-                data[0][h]["contains"][k] = self.internSet(data[0][h]["contains"][k])
-        for h in data[1]:
-            data[1][h]["execs"] = self.internSet(data[1][h]["execs"])
-        return
-
     def create_cachedata(self):
         data = [{}, {}]
         return data
@@ -168,15 +243,19 @@ class PythonParser():
         h = hash(str(node))
 
         if h in codeparsercache.pythoncache:
-            self.references = codeparsercache.pythoncache[h]["refs"]
-            self.execs = codeparsercache.pythoncache[h]["execs"]
-            self.contains = codeparsercache.pythoncache[h]["contains"]
+            self.references = set(codeparsercache.pythoncache[h].refs)
+            self.execs = set(codeparsercache.pythoncache[h].execs)
+            self.contains = {}
+            for i in codeparsercache.pythoncache[h].contains:
+                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
             return
 
         if h in codeparsercache.pythoncacheextras:
-            self.references = codeparsercache.pythoncacheextras[h]["refs"]
-            self.execs = codeparsercache.pythoncacheextras[h]["execs"]
-            self.contains = codeparsercache.pythoncacheextras[h]["contains"]
+            self.references = set(codeparsercache.pythoncacheextras[h].refs)
+            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
+            self.contains = {}
+            for i in codeparsercache.pythoncacheextras[h].contains:
+                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
             return
 
         code = compile(check_indent(str(node)), "<string>", "exec",
@@ -188,10 +267,7 @@ class PythonParser():
 
         self.execs.update(self.var_execs)
 
-        codeparsercache.pythoncacheextras[h] = {}
-        codeparsercache.pythoncacheextras[h]["refs"] = self.references
-        codeparsercache.pythoncacheextras[h]["execs"] = self.execs
-        codeparsercache.pythoncacheextras[h]["contains"] = self.contains
+        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
 
 class ShellParser():
     def __init__(self, name, log):
@@ -210,18 +286,17 @@ class ShellParser():
         h = hash(str(value))
 
         if h in codeparsercache.shellcache:
-            self.execs = codeparsercache.shellcache[h]["execs"]
+            self.execs = set(codeparsercache.shellcache[h].execs)
             return self.execs
 
         if h in codeparsercache.shellcacheextras:
-            self.execs = codeparsercache.shellcacheextras[h]["execs"]
+            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs
 
         self._parse_shell(value)
         self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)
 
-        codeparsercache.shellcacheextras[h] = {}
-        codeparsercache.shellcacheextras[h]["execs"] = self.execs
+        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)
 
         return self.execs
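The "15% cachesize" figure in the new header comment comes from the tuple-based __getstate__/__setstate__: by default pickle saves each instance's __dict__, so every object drags dictionary plumbing for its attribute names through the cache file, while a bare tuple state drops that entirely. A standalone sketch of the difference (illustrative class names, not from the patch):

    import pickle

    class DictState(object):              # default behaviour: __dict__ is pickled
        def __init__(self, refs, execs):
            self.refs, self.execs = refs, execs

    class TupleState(DictState):          # the patch's trick: bare tuple state
        def __getstate__(self):
            return (self.refs, self.execs)
        def __setstate__(self, state):
            (self.refs, self.execs) = state

    refs, execs = frozenset(["A"]), frozenset(["B"])
    with_dict  = [DictState(refs, execs) for _ in range(1000)]
    with_tuple = [TupleState(refs, execs) for _ in range(1000)]

    print(len(pickle.dumps(with_dict, -1)))   # per-instance dict overhead
    print(len(pickle.dumps(with_tuple, -1)))  # smaller: no attribute-name dicts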