From 404bbb2a0fc6442dbe277e877de64f4ee0c4eebb Mon Sep 17 00:00:00 2001 From: Andrey Bondarenko Date: Wed, 5 Nov 2014 01:25:10 +0500 Subject: pybootchartgui: _parse_proc_ps_log rewrite with iterator Iterators use much less memory, so larger bootcharts may be processed without triggering the OOM killer and massive swapping. On a (big) 11MB tarball this will have a performance penalty of about 10% but consume half the memory. Before: 23.50user 1.20system 0:24.97elapsed 98%CPU (0avgtext+0avgdata 770048maxresident)k After: 26.78user 0.44system 0:27.24elapsed 99%CPU (0avgtext+0avgdata 321192maxresident)k Signed-off-by: Armin Kuster --- scripts/pybootchartgui/pybootchartgui/parsing.py | 46 ++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/scripts/pybootchartgui/pybootchartgui/parsing.py b/scripts/pybootchartgui/pybootchartgui/parsing.py index 0b5063b4f3..97d28724a4 100644 --- a/scripts/pybootchartgui/pybootchartgui/parsing.py +++ b/scripts/pybootchartgui/pybootchartgui/parsing.py @@ -13,6 +13,9 @@ # You should have received a copy of the GNU General Public License # along with pybootchartgui. If not, see <http://www.gnu.org/licenses/>. + +import codecs +import itertools import os import string import re @@ -269,6 +272,30 @@ def _parse_headers(file): return headers, last return reduce(parse, file.read().decode('utf-8').split('\n'), (defaultdict(str),''))[0] +def _iter_parse_timed_blocks(file): + """Parses (ie., splits) a file into so-called timed-blocks. + + A timed-block consists of a timestamp on a line by itself followed + by zero or more lines of data for that point in time. + + Return an iterator over timed blocks, so there is no need to keep + all the data in memory. 
+ """ + def parse(block): + lines = block + if not lines: + raise ParseError('expected a timed-block consisting a timestamp followed by data lines') + try: + return (int(lines[0]), lines[1:]) + except ValueError: + raise ParseError("expected a timed-block, but timestamp '%s' is not an integer" % lines[0]) + data = codecs.iterdecode(file, "utf-8") + block = [line.strip() for line in itertools.takewhile(lambda s: s != "\n", data)] + while block: + if block and not block[-1].endswith(" not running\n"): + yield parse(block) + block = [line.strip() for line in itertools.takewhile(lambda s: s != "\n", data)] + def _parse_timed_blocks(file): """Parses (ie., splits) a file into so-called timed-blocks. A timed-block consists of a timestamp on a line by itself followed @@ -292,10 +319,18 @@ def _parse_proc_ps_log(writer, file): * cutime, cstime, priority, nice, 0, itrealvalue, starttime, vsize, rss, rlim, startcode, endcode, startstack, * kstkesp, kstkeip} """ + timed_blocks = _iter_parse_timed_blocks(file) + try: + first_timed_block = next(timed_blocks) + startTime = first_timed_block[0] + except StopIteration: + return None + processMap = {} ltime = 0 - timed_blocks = _parse_timed_blocks(file) - for time, lines in timed_blocks: + timed_blocks_count = 0 + for time, lines in itertools.chain((first_timed_block,), timed_blocks): + timed_blocks_count += 1 for line in lines: if not line: continue tokens = line.split(' ') @@ -325,13 +360,12 @@ def _parse_proc_ps_log(writer, file): process.last_sys_cpu_time = sysCpu ltime = time - if len (timed_blocks) < 2: + if timed_blocks_count < 2: return None - startTime = timed_blocks[0][0] - avgSampleLength = (ltime - startTime)/(len (timed_blocks) - 1) + avgSampleLength = (ltime - startTime)/(timed_blocks_count - 1) - return ProcessStats (writer, processMap, len (timed_blocks), avgSampleLength, startTime, ltime) + return ProcessStats (writer, processMap, timed_blocks_count, avgSampleLength, startTime, ltime) def 
_parse_taskstats_log(writer, file): """ -- cgit 1.2.3-korg