 lib/bb/cache.py      |  12 ----
 lib/bb/codeparser.py | 143 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 109 insertions(+), 46 deletions(-)
diff --git a/lib/bb/cache.py b/lib/bb/cache.py
index c7f3b7ab7..f892d7dc3 100644
--- a/lib/bb/cache.py
+++ b/lib/bb/cache.py
@@ -764,16 +764,6 @@ class MultiProcessCache(object):
self.cachedata = data
- def internSet(self, items):
- new = set()
- for i in items:
- new.add(intern(i))
- return new
-
- def compress_keys(self, data):
- # Override in subclasses if desired
- return
-
def create_cachedata(self):
data = [{}]
return data
@@ -833,8 +823,6 @@ class MultiProcessCache(object):
self.merge_data(extradata, data)
os.unlink(f)
- self.compress_keys(data)
-
with open(self.cachefile, "wb") as f:
p = pickle.Pickler(f, -1)
p.dump([data, self.__class__.CACHE_VERSION])
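
(Aside: a minimal standalone sketch, not part of the patch, of the duplication
problem the removed compress_keys() worked around and the codeparser.py changes
below solve at object-creation time. Python 2 is assumed, matching the patch;
Python 3 would use sys.intern().)

import pickle

# Two equal but distinct string objects; CPython only auto-interns
# compile-time constants, not strings built at runtime:
s1 = "".join(["do_", "compile"])
s2 = "".join(["do_", "compile"])
print(s1 is s2)                   # False: two objects with one value

# intern() collapses them to a single shared object:
print(intern(s1) is intern(s2))   # True

# pickle memoizes by object identity, not equality, so both copies are
# written to the stream and come back as distinct objects:
loaded = pickle.loads(pickle.dumps([s1, s2], -1))
print(loaded[0] is loaded[1])     # False: duplicated on disk and in memory
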
diff --git a/lib/bb/codeparser.py b/lib/bb/codeparser.py
index 2e8de12f3..8b8f91a76 100644
--- a/lib/bb/codeparser.py
+++ b/lib/bb/codeparser.py
@@ -33,9 +33,82 @@ def check_indent(codestr):
return codestr
+# pickle, in Python 2.7.3 at least, handles data duplication badly when
+# pickling and unpickling. Combine this with duplicate objects and things
+# are a mess.
+#
+# When the sets are originally created, Python calls intern() on the strings
+# they contain, which significantly improves memory usage. Sadly the
+# pickle/unpickle process doesn't call intern() on load, so the same strings
+# end up duplicated in memory. It also means pickle saves the same string
+# multiple times in the cache file.
+#
+# By giving the shell and python cache line objects __getstate__/__setstate__,
+# we force object creation through our own routine, where we can call intern()
+# (via internSet).
+#
+# We also use hashable frozensets and ensure we share references to them so
+# that duplicates can be removed, both in memory and in the resulting pickled
+# data.
+#
+# By playing these games, the cache file shrinks dramatically, meaning faster
+# load times, and the reloaded cache files also consume much less memory.
+# Smaller cache files, faster load times and lower memory usage are all good.
+#
+# A custom __getstate__/__setstate__ using tuples is worth about 15% of the
+# cache size on its own, since it avoids pickling the attribute names
+# repeatedly!
+
+class SetCache(object):
+ def __init__(self):
+ self.setcache = {}
+
+    def internSet(self, items):
+        new = []
+        for i in items:
+            new.append(intern(i))
+        s = frozenset(new)
+        h = hash(s)
+        if h in self.setcache:
+            return self.setcache[h]
+        self.setcache[h] = s
+        return s
+
+codecache = SetCache()
+
+class pythonCacheLine(object):
+ def __init__(self, refs, execs, contains):
+ self.refs = codecache.internSet(refs)
+ self.execs = codecache.internSet(execs)
+ self.contains = {}
+ for c in contains:
+ self.contains[c] = codecache.internSet(contains[c])
+
+ def __getstate__(self):
+ return (self.refs, self.execs, self.contains)
+
+ def __setstate__(self, state):
+ (refs, execs, contains) = state
+        self.__init__(refs, execs, contains)
+
+    def __hash__(self):
+ l = (hash(self.refs), hash(self.execs))
+ for c in sorted(self.contains.keys()):
+ l = l + (c, hash(self.contains[c]))
+ return hash(l)
+
+class shellCacheLine(object):
+ def __init__(self, execs):
+ self.execs = codecache.internSet(execs)
+
+    def __getstate__(self):
+        return self.execs
+
+    def __setstate__(self, state):
+        execs = state
+        self.__init__(execs)
+
+    def __hash__(self):
+        return hash(self.execs)
+
class CodeParserCache(MultiProcessCache):
cache_file_name = "bb_codeparser.dat"
- CACHE_VERSION = 6
+ CACHE_VERSION = 7
def __init__(self):
MultiProcessCache.__init__(self)
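
(Aside, not part of the patch: with the classes above, equal inputs collapse to
shared frozenset objects via the module-level codecache. A quick sketch with
made-up refs/execs values:)

line_a = pythonCacheLine(["A", "B"], ["os.path.join"], {"feat": ["x"]})
line_b = pythonCacheLine(["B", "A"], ["os.path.join"], {"feat": ["x"]})

# internSet() returns the same frozenset object for equal inputs, even when
# reordered, so the two lines share their sets in memory and in the pickle:
print(line_a.refs is line_b.refs)     # True
print(line_a.execs is line_b.execs)   # True
print(hash(line_a) == hash(line_b))   # True: identical cache lines
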
@@ -44,6 +117,27 @@ class CodeParserCache(MultiProcessCache):
self.pythoncacheextras = self.cachedata_extras[0]
self.shellcacheextras = self.cachedata_extras[1]
+ # To avoid duplication in the codeparser cache, keep
+ # a lookup of hashes of objects we already have
+ self.pythoncachelines = {}
+ self.shellcachelines = {}
+
+ def newPythonCacheLine(self, refs, execs, contains):
+ cacheline = pythonCacheLine(refs, execs, contains)
+ h = hash(cacheline)
+ if h in self.pythoncachelines:
+ return self.pythoncachelines[h]
+ self.pythoncachelines[h] = cacheline
+ return cacheline
+
+ def newShellCacheLine(self, execs):
+ cacheline = shellCacheLine(execs)
+ h = hash(cacheline)
+ if h in self.shellcachelines:
+ return self.shellcachelines[h]
+ self.shellcachelines[h] = cacheline
+ return cacheline
+
def init_cache(self, d):
MultiProcessCache.init_cache(self, d)
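
(Aside, not part of the patch: the helpers above deduplicate whole cache lines,
not just their sets. Hypothetical execs values, using the module-level
codeparsercache instance:)

l1 = codeparsercache.newShellCacheLine(["gcc", "make"])
l2 = codeparsercache.newShellCacheLine(["make", "gcc"])

# The second call finds the same hash in shellcachelines and returns the
# existing object instead of keeping an equal duplicate:
print(l1 is l2)   # True
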
@@ -51,25 +145,6 @@ class CodeParserCache(MultiProcessCache):
self.pythoncache = self.cachedata[0]
self.shellcache = self.cachedata[1]
- def compress_keys(self, data):
- # When the dicts are originally created, python calls intern() on the set keys
- # which significantly improves memory usage. Sadly the pickle/unpickle process
- # doesn't call intern() on the keys and results in the same strings being duplicated
- # in memory. This also means pickle will save the same string multiple times in
- # the cache file. By interning the data here, the cache file shrinks dramatically
- # meaning faster load times and the reloaded cache files also consume much less
- # memory. This is worth any performance hit from this loops and the use of the
- # intern() data storage.
- # Python 3.x may behave better in this area
- for h in data[0]:
- data[0][h]["refs"] = self.internSet(data[0][h]["refs"])
- data[0][h]["execs"] = self.internSet(data[0][h]["execs"])
- for k in data[0][h]["contains"]:
- data[0][h]["contains"][k] = self.internSet(data[0][h]["contains"][k])
- for h in data[1]:
- data[1][h]["execs"] = self.internSet(data[1][h]["execs"])
- return
-
def create_cachedata(self):
data = [{}, {}]
return data
@@ -168,15 +243,19 @@ class PythonParser():
h = hash(str(node))
if h in codeparsercache.pythoncache:
- self.references = codeparsercache.pythoncache[h]["refs"]
- self.execs = codeparsercache.pythoncache[h]["execs"]
- self.contains = codeparsercache.pythoncache[h]["contains"]
+ self.references = set(codeparsercache.pythoncache[h].refs)
+ self.execs = set(codeparsercache.pythoncache[h].execs)
+ self.contains = {}
+ for i in codeparsercache.pythoncache[h].contains:
+ self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
return
if h in codeparsercache.pythoncacheextras:
- self.references = codeparsercache.pythoncacheextras[h]["refs"]
- self.execs = codeparsercache.pythoncacheextras[h]["execs"]
- self.contains = codeparsercache.pythoncacheextras[h]["contains"]
+ self.references = set(codeparsercache.pythoncacheextras[h].refs)
+ self.execs = set(codeparsercache.pythoncacheextras[h].execs)
+ self.contains = {}
+ for i in codeparsercache.pythoncacheextras[h].contains:
+ self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
return
code = compile(check_indent(str(node)), "<string>", "exec",
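
(Aside: the set(...) wrappers above matter because the cached frozensets are
shared between cache lines; the parser takes private mutable copies so that
later mutation, e.g. the execs.update() below, cannot corrupt other entries.
Illustrative sketch:)

shared = frozenset(["A", "B"])   # shared, immutable cache value
refs = set(shared)               # independent mutable copy
refs.add("C")                    # safe: the cached frozenset is untouched
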
@@ -188,10 +267,7 @@ class PythonParser():
self.execs.update(self.var_execs)
- codeparsercache.pythoncacheextras[h] = {}
- codeparsercache.pythoncacheextras[h]["refs"] = self.references
- codeparsercache.pythoncacheextras[h]["execs"] = self.execs
- codeparsercache.pythoncacheextras[h]["contains"] = self.contains
+ codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
class ShellParser():
def __init__(self, name, log):
@@ -210,18 +286,17 @@ class ShellParser():
h = hash(str(value))
if h in codeparsercache.shellcache:
- self.execs = codeparsercache.shellcache[h]["execs"]
+ self.execs = set(codeparsercache.shellcache[h].execs)
return self.execs
if h in codeparsercache.shellcacheextras:
- self.execs = codeparsercache.shellcacheextras[h]["execs"]
+ self.execs = set(codeparsercache.shellcacheextras[h].execs)
return self.execs
self._parse_shell(value)
self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)
- codeparsercache.shellcacheextras[h] = {}
- codeparsercache.shellcacheextras[h]["execs"] = self.execs
+ codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)
return self.execs
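
(Aside, not part of the patch: a round-trip sketch of the
__getstate__/__setstate__ machinery defined earlier. __setstate__ re-runs
__init__, which goes back through codecache.internSet(), so within one process
a reloaded line regains the shared frozenset; on a fresh load, lines still
deduplicate against each other:)

import pickle

line = shellCacheLine(["gcc", "make"])
clone = pickle.loads(pickle.dumps(line, -1))

print(clone.execs is line.execs)   # True: both point at the codecache entry
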