From ddc7bb9064042a0d5da9546eaf6ff888dca63f0c Mon Sep 17 00:00:00 2001
From: Junegunn Choi <junegunn.c@gmail.com>
Date: Sun, 14 Aug 2016 01:53:06 +0900
Subject: [PATCH] [perf] Optimize AWK-style tokenizer for --nth

Approx. 50% less memory footprint and 40% improvement in query time
---
 src/tokenizer.go       | 44 ++++++++++++++++++++++++++++--------------
 src/util/chars.go      |  7 +++++++
 src/util/chars_test.go | 21 ++++++++++++++++++++
 src/util/util.go       | 24 -----------------------
 src/util/util_test.go  | 20 -------------------
 5 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/src/tokenizer.go b/src/tokenizer.go
index 05b890a2..eec19898 100644
--- a/src/tokenizer.go
+++ b/src/tokenizer.go
@@ -97,10 +97,11 @@ const (
 func awkTokenizer(input util.Chars) ([]util.Chars, int) {
 	// 9, 32
 	ret := []util.Chars{}
-	str := []rune{}
 	prefixLength := 0
 	state := awkNil
 	numChars := input.Length()
+	begin := 0
+	end := 0
 	for idx := 0; idx < numChars; idx++ {
 		r := input.Get(idx)
 		white := r == 9 || r == 32
@@ -109,26 +110,24 @@ func awkTokenizer(input util.Chars) ([]util.Chars, int) {
 			if white {
 				prefixLength++
 			} else {
-				state = awkBlack
-				str = append(str, r)
+				state, begin, end = awkBlack, idx, idx+1
 			}
 		case awkBlack:
-			str = append(str, r)
+			end = idx + 1
 			if white {
 				state = awkWhite
 			}
 		case awkWhite:
 			if white {
-				str = append(str, r)
+				end = idx + 1
 			} else {
-				ret = append(ret, util.RunesToChars(str))
-				state = awkBlack
-				str = []rune{r}
+				ret = append(ret, input.Slice(begin, end))
+				state, begin, end = awkBlack, idx, idx+1
 			}
 		}
 	}
-	if len(str) > 0 {
-		ret = append(ret, util.RunesToChars(str))
+	if begin < end {
+		ret = append(ret, input.Slice(begin, end))
 	}
 	return ret, prefixLength
 }
@@ -187,19 +186,19 @@ func Transform(tokens []Token, withNth []Range) []Token {
 	transTokens := make([]Token, len(withNth))
 	numTokens := len(tokens)
 	for idx, r := range withNth {
-		part := []rune{}
+		parts := []util.Chars{}
 		minIdx := 0
 		if r.begin == r.end {
 			idx := r.begin
 			if idx == rangeEllipsis {
-				part = append(part, joinTokensAsRunes(tokens)...)
+				parts = append(parts, util.RunesToChars(joinTokensAsRunes(tokens)))
 			} else {
 				if idx < 0 {
 					idx += numTokens + 1
 				}
 				if idx >= 1 && idx <= numTokens {
 					minIdx = idx - 1
-					part = append(part, tokens[idx-1].text.ToRunes()...)
+					parts = append(parts, tokens[idx-1].text)
 				}
 			}
 		} else {
@@ -226,17 +225,32 @@ func Transform(tokens []Token, withNth []Range) []Token {
 			minIdx = util.Max(0, begin-1)
 			for idx := begin; idx <= end; idx++ {
 				if idx >= 1 && idx <= numTokens {
-					part = append(part, tokens[idx-1].text.ToRunes()...)
+					parts = append(parts, tokens[idx-1].text)
 				}
 			}
 		}
+		// Merge multiple parts
+		var merged util.Chars
+		switch len(parts) {
+		case 0:
+			merged = util.RunesToChars([]rune{})
+		case 1:
+			merged = parts[0]
+		default:
+			runes := []rune{}
+			for _, part := range parts {
+				runes = append(runes, part.ToRunes()...)
+			}
+			merged = util.RunesToChars(runes)
+		}
+
 		var prefixLength int
 		if minIdx < numTokens {
 			prefixLength = tokens[minIdx].prefixLength
 		} else {
 			prefixLength = 0
 		}
-		transTokens[idx] = Token{util.RunesToChars(part), prefixLength, util.TrimLen(part)}
+		transTokens[idx] = Token{merged, prefixLength, merged.TrimLength()}
 	}
 	return transTokens
 }
diff --git a/src/util/chars.go b/src/util/chars.go
index 25a15ddd..6034ee53 100644
--- a/src/util/chars.go
+++ b/src/util/chars.go
@@ -111,3 +111,10 @@ func (chars *Chars) ToRunes() []rune {
 	}
 	return runes
 }
+
+func (chars *Chars) Slice(b int, e int) Chars {
+	if chars.runes != nil {
+		return Chars{runes: chars.runes[b:e]}
+	}
+	return Chars{bytes: chars.bytes[b:e]}
+}
diff --git a/src/util/chars_test.go b/src/util/chars_test.go
index e42cfb7c..2cb6fc76 100644
--- a/src/util/chars_test.go
+++ b/src/util/chars_test.go
@@ -34,3 +34,24 @@ func TestCharsToString(t *testing.T) {
 		t.Error()
 	}
 }
+
+func TestTrimLength(t *testing.T) {
+	check := func(str string, exp int) {
+		chars := ToChars([]byte(str))
+		trimmed := chars.TrimLength()
+		if trimmed != exp {
+			t.Errorf("Invalid TrimLength result for '%s': %d (expected %d)",
+				str, trimmed, exp)
+		}
+	}
+	check("hello", 5)
+	check("hello ", 5)
+	check("hello  ", 5)
+	check(" hello", 5)
+	check("  hello", 5)
+	check(" hello ", 5)
+	check("  hello  ", 5)
+	check("h   o", 5)
+	check("  h   o  ", 5)
+	check("         ", 0)
+}
diff --git a/src/util/util.go b/src/util/util.go
index 90cc28b4..a95340e7 100644
--- a/src/util/util.go
+++ b/src/util/util.go
@@ -83,30 +83,6 @@ func IsTty() bool {
 	return int(C.isatty(C.int(os.Stdin.Fd()))) != 0
 }
 
-// TrimLen returns the length of trimmed rune array
-func TrimLen(runes []rune) int {
-	var i int
-	for i = len(runes) - 1; i >= 0; i-- {
-		char := runes[i]
-		if char != ' ' && char != '\t' {
-			break
-		}
-	}
-	// Completely empty
-	if i < 0 {
-		return 0
-	}
-
-	var j int
-	for j = 0; j < len(runes); j++ {
-		char := runes[j]
-		if char != ' ' && char != '\t' {
-			break
-		}
-	}
-	return i - j + 1
-}
-
 // ExecCommand executes the given command with $SHELL
 func ExecCommand(command string) *exec.Cmd {
 	shell := os.Getenv("SHELL")
diff --git a/src/util/util_test.go b/src/util/util_test.go
index 8aeaeac5..06cfd4f2 100644
--- a/src/util/util_test.go
+++ b/src/util/util_test.go
@@ -20,23 +20,3 @@ func TestContrain(t *testing.T) {
 		t.Error("Expected", 3)
 	}
 }
-
-func TestTrimLen(t *testing.T) {
-	check := func(str string, exp int) {
-		trimmed := TrimLen([]rune(str))
-		if trimmed != exp {
-			t.Errorf("Invalid TrimLen result for '%s': %d (expected %d)",
-				str, trimmed, exp)
-		}
-	}
-	check("hello", 5)
-	check("hello ", 5)
-	check("hello  ", 5)
-	check(" hello", 5)
-	check("  hello", 5)
-	check(" hello ", 5)
-	check("  hello  ", 5)
-	check("h   o", 5)
-	check("  h   o  ", 5)
-	check("         ", 0)
-}
-- 
GitLab