 lib/bb/cache.py      |  12 ----
 lib/bb/codeparser.py | 143 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 109 insertions, 46 deletions
diff --git a/lib/bb/cache.py b/lib/bb/cache.py
index c7f3b7ab7..f892d7dc3 100644
--- a/lib/bb/cache.py
+++ b/lib/bb/cache.py
@@ -764,16 +764,6 @@ class MultiProcessCache(object):
 
         self.cachedata = data
 
-    def internSet(self, items):
-        new = set()
-        for i in items:
-            new.add(intern(i))
-        return new
-
-    def compress_keys(self, data):
-        # Override in subclasses if desired
-        return
-
     def create_cachedata(self):
         data = [{}]
         return data
@@ -833,8 +823,6 @@ class MultiProcessCache(object):
                 self.merge_data(extradata, data)
                 os.unlink(f)
 
-        self.compress_keys(data)
-
         with open(self.cachefile, "wb") as f:
             p = pickle.Pickler(f, -1)
             p.dump([data, self.__class__.CACHE_VERSION])
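The internSet()/compress_keys() machinery removed above existed because pickle does not preserve string interning: equal strings that are distinct objects are written out in full and come back as separate copies. A minimal standalone sketch of the effect (Python 3 spelling; illustrative, not BitBake code):

    import pickle
    from sys import intern   # intern() was a builtin on the Python 2 this patch targets

    # Build equal strings at runtime so CPython cannot constant-fold and
    # intern them for us; these model duplicate keys from separate parses.
    a = "".join(["do_", "compile"])
    b = "".join(["do_c", "ompile"])
    assert a == b and a is not b

    # pickle memoises by object identity, so distinct-but-equal strings
    # are serialised twice and load back as two separate objects.
    restored = pickle.loads(pickle.dumps([a, b], -1))
    print(restored[0] is restored[1])    # False: memory is duplicated

    # Re-interning after load (what compress_keys() did) collapses them.
    collapsed = [intern(s) for s in restored]
    print(collapsed[0] is collapsed[1])  # True: one shared object

The codeparser.py changes below make this post-load hook unnecessary by deduplicating at object-creation time instead.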
diff --git a/lib/bb/codeparser.py b/lib/bb/codeparser.py
index 2e8de12f3..8b8f91a76 100644
--- a/lib/bb/codeparser.py
+++ b/lib/bb/codeparser.py
@@ -33,9 +33,82 @@ def check_indent(codestr):
 
     return codestr
 
+# Basically pickle, in python 2.7.3 at least, does badly with data duplication
+# upon pickling and unpickling. Combine this with duplicate objects and things
+# are a mess.
+#
+# When the sets are originally created, python calls intern() on the set keys
+# which significantly improves memory usage. Sadly the pickle/unpickle process
+# doesn't call intern() on the keys and results in the same strings being duplicated
+# in memory. This also means pickle will save the same string multiple times in
+# the cache file.
+#
+# By having shell and python cacheline objects with setstate/getstate, we force
+# the object creation through our own routine where we can call intern (via internSet).
+#
+# We also use hashable frozensets and ensure we use references to these so that
+# duplicates can be removed, both in memory and in the resulting pickled data.
+#
+# By playing these games, the size of the cache file shrinks dramatically
+# meaning faster load times and the reloaded cache files also consume much less
+# memory. Smaller cache files, faster load times and lower memory usage is good.
+#
+# A custom getstate/setstate using tuples is actually worth 15% cachesize by
+# avoiding duplication of the attribute names!
+
+class SetCache(object):
+    def __init__(self):
+        self.setcache = {}
+
+    def internSet(self, items):
+
+        new = []
+        for i in items:
+            new.append(intern(i))
+        s = frozenset(new)
+        if hash(s) in self.setcache:
+            return self.setcache[hash(s)]
+        self.setcache[hash(s)] = s
+        return s
+
+codecache = SetCache()
+
+class pythonCacheLine(object):
+    def __init__(self, refs, execs, contains):
+        self.refs = codecache.internSet(refs)
+        self.execs = codecache.internSet(execs)
+        self.contains = {}
+        for c in contains:
+            self.contains[c] = codecache.internSet(contains[c])
+
+    def __getstate__(self):
+        return (self.refs, self.execs, self.contains)
+
+    def __setstate__(self, state):
+        (refs, execs, contains) = state
+        self.__init__(refs, execs, contains)
+    def __hash__(self):
+        l = (hash(self.refs), hash(self.execs))
+        for c in sorted(self.contains.keys()):
+            l = l + (c, hash(self.contains[c]))
+        return hash(l)
+
+class shellCacheLine(object):
+    def __init__(self, execs):
+        self.execs = codecache.internSet(execs)
+
+    def __getstate__(self):
+        return (self.execs)
+
+    def __setstate__(self, state):
+        (execs) = state
+        self.__init__(execs)
+    def __hash__(self):
+        return hash(self.execs)
+
 class CodeParserCache(MultiProcessCache):
     cache_file_name = "bb_codeparser.dat"
-    CACHE_VERSION = 6
+    CACHE_VERSION = 7
 
     def __init__(self):
         MultiProcessCache.__init__(self)
@@ -44,6 +117,27 @@ class CodeParserCache(MultiProcessCache):
         self.pythoncacheextras = self.cachedata_extras[0]
         self.shellcacheextras = self.cachedata_extras[1]
 
+        # To avoid duplication in the codeparser cache, keep
+        # a lookup of hashes of objects we already have
+        self.pythoncachelines = {}
+        self.shellcachelines = {}
+
+    def newPythonCacheLine(self, refs, execs, contains):
+        cacheline = pythonCacheLine(refs, execs, contains)
+        h = hash(cacheline)
+        if h in self.pythoncachelines:
+            return self.pythoncachelines[h]
+        self.pythoncachelines[h] = cacheline
+        return cacheline
+
+    def newShellCacheLine(self, execs):
+        cacheline = shellCacheLine(execs)
+        h = hash(cacheline)
+        if h in self.shellcachelines:
+            return self.shellcachelines[h]
+        self.shellcachelines[h] = cacheline
+        return cacheline
+
     def init_cache(self, d):
         MultiProcessCache.init_cache(self, d)
 
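The point of routing every set through codecache and every cache line through newPythonCacheLine()/newShellCacheLine() is that equal data collapses to one shared object, and pickle stores a shared object once, then emits cheap memo references to it. A rough standalone sketch of the size effect (made-up set contents, not from a real parse):

    import pickle

    # One frozenset shared by many cache lines, versus a fresh equal
    # frozenset per line (what happens without the dedup lookup).
    shared = frozenset(["oe_runmake", "install"])
    deduped    = [("line%d" % i, shared) for i in range(1000)]
    duplicated = [("line%d" % i, frozenset(["oe_runmake", "install"]))
                  for i in range(1000)]

    # The shared object is serialised once plus memo references; the
    # duplicates are each serialised in full and load back as 1000 objects.
    print(len(pickle.dumps(deduped, -1)))     # noticeably smaller
    print(len(pickle.dumps(duplicated, -1)))  # larger on disk, and in memory after load

The dedup lookups key on hash(cacheline), so equal lines seen later reuse the object stored for the first occurrence.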
@@ -51,25 +145,6 @@ class CodeParserCache(MultiProcessCache):
         self.pythoncache = self.cachedata[0]
         self.shellcache = self.cachedata[1]
 
-    def compress_keys(self, data):
-        # When the dicts are originally created, python calls intern() on the set keys
-        # which significantly improves memory usage. Sadly the pickle/unpickle process
-        # doesn't call intern() on the keys and results in the same strings being duplicated
-        # in memory. This also means pickle will save the same string multiple times in
-        # the cache file. By interning the data here, the cache file shrinks dramatically
-        # meaning faster load times and the reloaded cache files also consume much less
-        # memory. This is worth any performance hit from this loops and the use of the
-        # intern() data storage.
-        # Python 3.x may behave better in this area
-        for h in data[0]:
-            data[0][h]["refs"] = self.internSet(data[0][h]["refs"])
-            data[0][h]["execs"] = self.internSet(data[0][h]["execs"])
-            for k in data[0][h]["contains"]:
-                data[0][h]["contains"][k] = self.internSet(data[0][h]["contains"][k])
-        for h in data[1]:
-            data[1][h]["execs"] = self.internSet(data[1][h]["execs"])
-        return
-
     def create_cachedata(self):
         data = [{}, {}]
         return data
@@ -168,15 +243,19 @@ class PythonParser():
         h = hash(str(node))
 
         if h in codeparsercache.pythoncache:
-            self.references = codeparsercache.pythoncache[h]["refs"]
-            self.execs = codeparsercache.pythoncache[h]["execs"]
-            self.contains = codeparsercache.pythoncache[h]["contains"]
+            self.references = set(codeparsercache.pythoncache[h].refs)
+            self.execs = set(codeparsercache.pythoncache[h].execs)
+            self.contains = {}
+            for i in codeparsercache.pythoncache[h].contains:
+                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
             return
 
         if h in codeparsercache.pythoncacheextras:
-            self.references = codeparsercache.pythoncacheextras[h]["refs"]
-            self.execs = codeparsercache.pythoncacheextras[h]["execs"]
-            self.contains = codeparsercache.pythoncacheextras[h]["contains"]
+            self.references = set(codeparsercache.pythoncacheextras[h].refs)
+            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
+            self.contains = {}
+            for i in codeparsercache.pythoncacheextras[h].contains:
+                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
             return
 
         code = compile(check_indent(str(node)), "<string>", "exec",
@@ -188,10 +267,7 @@ class PythonParser():
 
         self.execs.update(self.var_execs)
 
-        codeparsercache.pythoncacheextras[h] = {}
-        codeparsercache.pythoncacheextras[h]["refs"] = self.references
-        codeparsercache.pythoncacheextras[h]["execs"] = self.execs
-        codeparsercache.pythoncacheextras[h]["contains"] = self.contains
+        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
 
 class ShellParser():
     def __init__(self, name, log):
@@ -210,18 +286,17 @@ class ShellParser():
         h = hash(str(value))
 
         if h in codeparsercache.shellcache:
-            self.execs = codeparsercache.shellcache[h]["execs"]
+            self.execs = set(codeparsercache.shellcache[h].execs)
             return self.execs
 
         if h in codeparsercache.shellcacheextras:
-            self.execs = codeparsercache.shellcacheextras[h]["execs"]
+            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs
 
         self._parse_shell(value)
         self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)
 
-        codeparsercache.shellcacheextras[h] = {}
-        codeparsercache.shellcacheextras[h]["execs"] = self.execs
+        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)
 
         return self.execs
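The "15% cachesize" figure in the new header comment comes from the tuple-based __getstate__/__setstate__: by default pickle saves each instance's __dict__, so every object drags dictionary plumbing for its attribute names through the cache file, while a bare tuple state drops that entirely. A standalone sketch of the difference (illustrative class names, not from the patch):

    import pickle

    class DictState(object):              # default behaviour: __dict__ is pickled
        def __init__(self, refs, execs):
            self.refs, self.execs = refs, execs

    class TupleState(DictState):          # the patch's trick: bare tuple state
        def __getstate__(self):
            return (self.refs, self.execs)
        def __setstate__(self, state):
            (self.refs, self.execs) = state

    refs, execs = frozenset(["A"]), frozenset(["B"])
    with_dict  = [DictState(refs, execs) for _ in range(1000)]
    with_tuple = [TupleState(refs, execs) for _ in range(1000)]

    print(len(pickle.dumps(with_dict, -1)))   # per-instance dict overhead
    print(len(pickle.dumps(with_tuple, -1)))  # smaller: no attribute-name dicts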