From 404bbb2a0fc6442dbe277e877de64f4ee0c4eebb Mon Sep 17 00:00:00 2001
From: Andrey Bondarenko <abone27@mail.ru>
Date: Wed, 5 Nov 2014 01:25:10 +0500
Subject: pybootchartgui: _parse_proc_ps_log rewrite with iterator

Iterators use much less memory, so larger bootcharts
may be processed without triggering the OOM killer or
massive swapping.

On a (big) 11MB tarball this incurs a performance penalty
of about 10%, but consumes less than half the memory.

Before:
23.50user 1.20system 0:24.97elapsed 98%CPU (0avgtext+0avgdata 770048maxresident)k

After:
26.78user 0.44system 0:27.24elapsed 99%CPU (0avgtext+0avgdata 321192maxresident)k

Signed-off-by: Armin Kuster <akuster808@gmail.com>
---
 scripts/pybootchartgui/pybootchartgui/parsing.py | 46 ++++++++++++++++++++----
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/scripts/pybootchartgui/pybootchartgui/parsing.py b/scripts/pybootchartgui/pybootchartgui/parsing.py
index 0b5063b4f3..97d28724a4 100644
--- a/scripts/pybootchartgui/pybootchartgui/parsing.py
+++ b/scripts/pybootchartgui/pybootchartgui/parsing.py
@@ -13,6 +13,9 @@
 #  You should have received a copy of the GNU General Public License
 #  along with pybootchartgui. If not, see <http://www.gnu.org/licenses/>.
 
+
+import codecs
+import itertools
 import os
 import string
 import re
@@ -269,6 +272,30 @@ def _parse_headers(file):
         return headers, last
     return reduce(parse, file.read().decode('utf-8').split('\n'), (defaultdict(str),''))[0]
 
+def _iter_parse_timed_blocks(file):
+    """Parses (i.e., splits) a file into so-called timed-blocks.
+
+    A timed-block consists of a timestamp on a line by itself followed
+    by zero or more data lines.  Blocks are separated by one or more
+    blank lines; a block whose last line ends with " not running" is
+    skipped.  Lazily yields (timestamp, lines) tuples so the whole file
+    is never held in memory; raises ParseError on a non-integer timestamp.
+    """
+    def parse(block):
+        lines = block
+        if not lines:
+            raise ParseError('expected a timed-block consisting a timestamp followed by data lines')
+        try:
+            return (int(lines[0]), lines[1:])
+        except ValueError:
+            raise ParseError("expected a timed-block, but timestamp '%s' is not an integer" % lines[0])
+    data = codecs.iterdecode(file, "utf-8")
+    for is_blank, group in itertools.groupby(data, lambda s: s == "\n"):
+        if is_blank: continue
+        block = [line.strip() for line in group]
+        if not block[-1].endswith(" not running"):
+            yield parse(block)
+
 def _parse_timed_blocks(file):
     """Parses (ie., splits) a file into so-called timed-blocks. A
     timed-block consists of a timestamp on a line by itself followed
@@ -292,10 +319,18 @@ def _parse_proc_ps_log(writer, file):
      *  cutime, cstime, priority, nice, 0, itrealvalue, starttime, vsize, rss, rlim, startcode, endcode, startstack,
      *  kstkesp, kstkeip}
     """
+    timed_blocks = _iter_parse_timed_blocks(file)
+    try:
+        first_timed_block = next(timed_blocks)
+        startTime = first_timed_block[0]
+    except StopIteration:
+        return None
+
     processMap = {}
     ltime = 0
-    timed_blocks = _parse_timed_blocks(file)
-    for time, lines in timed_blocks:
+    timed_blocks_count = 0
+    for time, lines in itertools.chain((first_timed_block,), timed_blocks):
+        timed_blocks_count += 1
         for line in lines:
             if not line: continue
             tokens = line.split(' ')
@@ -325,13 +360,12 @@ def _parse_proc_ps_log(writer, file):
             process.last_sys_cpu_time = sysCpu
         ltime = time
 
-    if len (timed_blocks) < 2:
+    if timed_blocks_count < 2:
         return None
 
-    startTime = timed_blocks[0][0]
-    avgSampleLength = (ltime - startTime)/(len (timed_blocks) - 1)
+    avgSampleLength = (ltime - startTime)/(timed_blocks_count - 1)
 
-    return ProcessStats (writer, processMap, len (timed_blocks), avgSampleLength, startTime, ltime)
+    return ProcessStats (writer, processMap, timed_blocks_count, avgSampleLength, startTime, ltime)
 
 def _parse_taskstats_log(writer, file):
     """
-- 
cgit 1.2.3-korg