From 592ee222a1c6da42925fb56801f226884b6724ec Mon Sep 17 00:00:00 2001 From: Etienne Cordonnier Date: Wed, 1 Feb 2023 15:19:00 +0100 Subject: siggen: Fix inefficient string concatenation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed in https://stackoverflow.com/a/4435752/1710392 , CPython has an optimization for statements in the form "a = a + b" or "a += b". It seems that this line does not get optimized, because it has a form a = a + b + c: data = data + "./" + f.split("/./")[1] For that reason, it does a copy of data for each iteration, potentially copying megabytes of data for each iteration. Changing this line causes SignatureGeneratorBasic::get_taskhash to take 0.06 seconds instead of 45 seconds on my test setup where SRC_URI points to a big directory. Note that PEP8 recommends explicitely not to use this optimization which is specific to CPython: "do not rely on CPython’s efficient implementation of in-place string concatenation for statements in the form a += b or a = a + b" However, the PEP8 recommended form using "join()" also does not avoid the copy and takes 45 seconds in my test setup: data = ''.join((data, "./", f.split("/./")[1])) I have changed the other lines to also use += for consistency only, however those were in the form a = a + b and were optimized already. Co-authored-by: JJ Robertson Signed-off-by: Etienne Cordonnier Signed-off-by: Richard Purdie (cherry picked from commit 195750f2ca355e29d51219c58ecb2c1d83692717) Signed-off-by: Steve Sakoman --- lib/bb/siggen.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/bb/siggen.py b/lib/bb/siggen.py index 07bb52945..dd7039e5d 100644 --- a/lib/bb/siggen.py +++ b/lib/bb/siggen.py @@ -332,19 +332,19 @@ class SignatureGeneratorBasic(SignatureGenerator): data = self.basehash[tid] for dep in self.runtaskdeps[tid]: - data = data + self.get_unihash(dep) + data += self.get_unihash(dep) for (f, cs) in self.file_checksum_values[tid]: if cs: if "/./" in f: - data = data + "./" + f.split("/./")[1] - data = data + cs + data += "./" + f.split("/./")[1] + data += cs if tid in self.taints: if self.taints[tid].startswith("nostamp:"): - data = data + self.taints[tid][8:] + data += self.taints[tid][8:] else: - data = data + self.taints[tid] + data += self.taints[tid] h = hashlib.sha256(data.encode("utf-8")).hexdigest() self.taskhash[tid] = h -- cgit 1.2.3-korg