From d2cb8594dcc7328e7876f6fdf0deb7f8af149622 Mon Sep 17 00:00:00 2001
From: Mzack9999 <mzack9999@protonmail.com>
Date: Wed, 24 Jun 2026 23:57:09 +0200
Subject: [PATCH] httpx: reduce response body memory usage

---
 README.md                            |   4 +-
 common/httpx/httpx.go                |  41 +++---
 common/httpx/option.go               |  23 +++-
 common/httpx/response_memory_test.go | 190 +++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 24 deletions(-)
 create mode 100644 common/httpx/response_memory_test.go

diff --git a/README.md b/README.md
index 234b7ae0..e41811d1 100644
--- a/README.md
+++ b/README.md
@@ -265,8 +265,8 @@ OPTIMIZATIONS:
    -retries int                       number of retries
    -timeout int                       timeout in seconds (default 10)
    -delay value                       duration between each http request (eg: 200ms, 1s) (default -1ns)
-   -rsts, -response-size-to-save int  max response size to save in bytes (default 512000000)
-   -rstr, -response-size-to-read int  max response size to read in bytes (default 512000000)
+   -rsts, -response-size-to-save int  max response size to save in bytes (default 50000000)
+   -rstr, -response-size-to-read int  max response size to read in bytes (default 50000000)
 
 CLOUD:
    -auth                           configure projectdiscovery cloud (pdcp) api key (default true)
diff --git a/common/httpx/httpx.go b/common/httpx/httpx.go
index c6d46c4d..7820a022 100644
--- a/common/httpx/httpx.go
+++ b/common/httpx/httpx.go
@@ -1,6 +1,7 @@
 package httpx
 
 import (
+	"bytes"
 	"context"
 	"crypto/tls"
 	"fmt"
@@ -291,21 +292,19 @@ get_response:
 		return nil, closeErr
 	}
 
-	// Todo: replace with https://github.com/projectdiscovery/utils/issues/110
-	resp.RawData = make([]byte, len(respbody))
-	copy(resp.RawData, respbody)
+	// Keep a reference to the undecoded body. DecodeData returns the same slice
+	// when no transcoding is needed (the common case), so RawData and Data end up
+	// sharing the same backing array and we avoid an extra full-body copy. When
+	// DecodeData transcodes it returns a fresh slice, so RawData still holds the
+	// original undecoded bytes. Both fields are read-only afterwards, so sharing
+	// the backing array is safe.
+	rawbody := respbody
 
 	respbody, err = DecodeData(respbody, httpresp.Header)
 	if err != nil && !shouldIgnoreBodyErrors {
 		return nil, err
 	}
-
-	respbodystr := string(respbody)
-
-	// check if we need to strip html
-	if h.Options.VHostStripHTML {
-		respbodystr = h.htmlPolicy.Sanitize(respbodystr)
-	}
+	resp.RawData = rawbody
 
 	// if content length is not defined
 	if resp.ContentLength <= 0 {
@@ -326,11 +325,23 @@ get_response:
 
 	// fill metrics
 	resp.StatusCode = httpresp.StatusCode
-	if respbodystr != "" {
-		// number of words
-		resp.Words = len(strings.Split(respbodystr, " "))
-		// number of lines
-		resp.Lines = len(strings.Split(strings.TrimSpace(respbodystr), "\n"))
+
+	// Word/line counts are computed directly over the body bytes to avoid
+	// materializing an extra full-body string copy (and the slice produced by
+	// strings.Split) on the hot path. When HTML stripping is enabled the
+	// sanitized string is required, so counts are derived from it to preserve the
+	// previous behavior.
+	if h.Options.VHostStripHTML {
+		respbodystr := h.htmlPolicy.Sanitize(string(respbody))
+		if respbodystr != "" {
+			resp.Words = len(strings.Split(respbodystr, " "))
+			resp.Lines = len(strings.Split(strings.TrimSpace(respbodystr), "\n"))
+		}
+	} else if len(respbody) > 0 {
+		// equivalent to len(strings.Split(string(respbody), " ")) and
+		// len(strings.Split(strings.TrimSpace(string(respbody)), "\n"))
+		resp.Words = bytes.Count(respbody, []byte{' '}) + 1
+		resp.Lines = bytes.Count(bytes.TrimSpace(respbody), []byte{'\n'}) + 1
 	}
 
 	if !h.Options.Unsafe && h.Options.TLSGrab {
diff --git a/common/httpx/option.go b/common/httpx/option.go
index 56d11cbc..ce43aad7 100644
--- a/common/httpx/option.go
+++ b/common/httpx/option.go
@@ -10,13 +10,22 @@ import (
 	"github.com/projectdiscovery/networkpolicy"
 )
 
-// DefaultMaxResponseBodySize is the default maximum response body size
-var DefaultMaxResponseBodySize int64
-
-func init() {
-	maxResponseBodySize, _ := humanize.ParseBytes("512Mb")
-	DefaultMaxResponseBodySize = int64(maxResponseBodySize)
-}
+// DefaultMaxResponseBodySize is the default cap, in bytes, on how much of a
+// response body httpx reads into memory (the README documents the same value
+// as the default for both -rstr and -rsts). It is deliberately bounded: bodies
+// are held in memory and the footprint scales with the number of concurrent
+// threads, so a very large cap can cause excessive memory usage or OOM on
+// large responses. Typical web pages are far smaller; raise -rstr / -rsts
+// explicitly when larger responses must be read or stored.
+//
+// NOTE: this is deliberately a var initializer rather than an init() function.
+// init() runs only after every package-level variable initializer, so setting
+// the value in init() left DefaultOptions, which is initialized from it,
+// observing zero during package initialization.
+var DefaultMaxResponseBodySize = func() int64 {
+	maxResponseBodySize, _ := humanize.ParseBytes("50mb") // fixed literal, so the parse error is deliberately ignored
+	return int64(maxResponseBodySize)
+}()
 
 // Options contains configuration options for the client
 type Options struct {
diff --git a/common/httpx/response_memory_test.go b/common/httpx/response_memory_test.go
new file mode 100644
index 00000000..12ae7293
--- /dev/null
+++ b/common/httpx/response_memory_test.go
@@ -0,0 +1,190 @@
+package httpx
+
+import (
+	"bytes"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/projectdiscovery/retryablehttp-go"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/text/encoding/simplifiedchinese"
+	"golang.org/x/text/transform"
+)
+
+// newLocalHTTPX returns an HTTPX client built from DefaultOptions with CDN
+// checks disabled, a 5s timeout and no retries; it fails t if New errors.
+func newLocalHTTPX(t *testing.T) *HTTPX {
+	t.Helper()
+	options := DefaultOptions
+	options.CdnCheck = "false"
+	options.Timeout = 5 * time.Second
+	options.RetryMax = 0
+	// NB: MaxResponseBodySizeToRead is inherited from DefaultOptions and must be
+	// non-zero (see TestDefaultOptionsHasNonZeroReadSize) or no body is read.
+
+	ht, err := New(&options)
+	require.NoError(t, err)
+	return ht
+}
+
+// doLocal sends a GET request for url through ht with zero-value UnsafeOptions
+// and returns the resulting httpx Response, failing t on any error.
+func doLocal(t *testing.T, ht *HTTPX, url string) *Response {
+	t.Helper()
+	req, err := retryablehttp.NewRequest(http.MethodGet, url, nil)
+	require.NoError(t, err)
+	resp, err := ht.Do(req, UnsafeOptions{})
+	require.NoError(t, err)
+	return resp
+}
+
+// legacyWordsLines computes word and line counts with the strings.Split-based
+// logic that Do used before the refactor; an empty body yields (0, 0).
+func legacyWordsLines(body []byte) (words, lines int) {
+	s := string(body)
+	if s != "" {
+		words = len(strings.Split(s, " "))
+		lines = len(strings.Split(strings.TrimSpace(s), "\n"))
+	}
+	return
+}
+
+// TestDefaultOptionsHasNonZeroReadSize guards against the package-level
+// initialization-order regression in which DefaultOptions was initialized before
+// DefaultMaxResponseBodySize, leaving MaxResponseBodySizeToRead at 0 (so the
+// LimitReader read zero bytes and library users received empty bodies).
+func TestDefaultOptionsHasNonZeroReadSize(t *testing.T) {
+	require.NotZero(t, DefaultMaxResponseBodySize)
+	require.Equal(t, DefaultMaxResponseBodySize, DefaultOptions.MaxResponseBodySizeToRead)
+}
+
+func TestDoBodyNoDecodePreservesRawAndData(t *testing.T) {
+	body := []byte("hello world\nsecond line\n") // plain ASCII, so no transcoding is expected
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write(body)
+	}))
+	defer ts.Close()
+
+	resp := doLocal(t, newLocalHTTPX(t), ts.URL)
+
+	require.Equal(t, body, resp.Data, "decoded data must equal body")
+	require.Equal(t, body, resp.RawData, "raw data must equal undecoded body")
+
+	wantWords, wantLines := legacyWordsLines(body) // reference counts from the pre-refactor logic
+	require.Equal(t, wantWords, resp.Words)
+	require.Equal(t, wantLines, resp.Lines)
+}
+
+func TestDoBodyEmpty(t *testing.T) {
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html") // headers only: the handler never writes a body
+	}))
+	defer ts.Close()
+
+	resp := doLocal(t, newLocalHTTPX(t), ts.URL)
+	require.Empty(t, resp.Data)
+	require.Empty(t, resp.RawData)
+	require.Equal(t, 0, resp.Words) // an empty body must leave both counters at zero
+	require.Equal(t, 0, resp.Lines)
+}
+
+// TestDoBodyGBKDecodeKeepsRawUndecoded serves a GBK-encoded body declared via
+// the Content-Type charset and checks that RawData keeps the bytes as sent
+// while Data holds the UTF-8 decoded text.
+func TestDoBodyGBKDecodeKeepsRawUndecoded(t *testing.T) {
+	utf8Body := "<html><head></head><body>你好世界 测试</body></html>"
+	gbkBody, _, err := transform.Bytes(simplifiedchinese.GBK.NewEncoder(), []byte(utf8Body))
+	require.NoError(t, err)
+	require.NotEqual(t, []byte(utf8Body), gbkBody, "precondition: gbk bytes differ from utf8")
+
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html; charset=gbk")
+		_, _ = w.Write(gbkBody)
+	}))
+	defer ts.Close()
+
+	resp := doLocal(t, newLocalHTTPX(t), ts.URL)
+
+	require.Equal(t, gbkBody, resp.RawData, "RawData must hold the original undecoded bytes")
+	require.Equal(t, []byte(utf8Body), resp.Data, "Data must hold the decoded UTF-8 bytes")
+}
+
+// TestDoBodyNoDecodeSharesBacking pins the memory optimization: when no
+// decoding happens, RawData and Data must point at the same backing array,
+// i.e. no second full-body copy is made.
+func TestDoBodyNoDecodeSharesBacking(t *testing.T) {
+	body := []byte("shared backing array body")
+	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain")
+		_, _ = w.Write(body)
+	}))
+	defer ts.Close()
+
+	resp := doLocal(t, newLocalHTTPX(t), ts.URL)
+
+	require.NotEmpty(t, resp.Data)
+	require.NotEmpty(t, resp.RawData)
+	require.Equal(t, resp.Data, resp.RawData)
+	require.Same(t, &resp.Data[0], &resp.RawData[0],
+		"RawData and Data should share the backing array on the no-decode path")
+}
+
+// TestWordsLinesEquivalence checks that the bytes.Count-based expressions (copied
+// from Do, not invoked through it) match the previous strings.Split-based
+// counting across a range of inputs and edge cases.
+func TestWordsLinesEquivalence(t *testing.T) {
+	cases := []string{
+		"",
+		"a",
+		"a b c",
+		"   ",                        // only spaces
+		"a  b",                       // consecutive spaces
+		"line1\nline2\nline3",        // multiple lines
+		"\n\n\n",                     // only newlines
+		"  leading and trailing  ",   // surrounding whitespace
+		"\n  mixed \t whitespace \n", // tabs/newlines around
+		"trailing newline\n",
+		"word",
+		"tab\tseparated values",
+		"unicode \u00a0 nbsp space",
+		"emoji 😀 and spaces ",
+	}
+
+	for _, c := range cases {
+		body := []byte(c)
+		wantWords, wantLines := legacyWordsLines(body)
+
+		var gotWords, gotLines int
+		if len(body) > 0 {
+			gotWords = bytes.Count(body, []byte{' '}) + 1
+			gotLines = bytes.Count(bytes.TrimSpace(body), []byte{'\n'}) + 1
+		}
+
+		require.Equalf(t, wantWords, gotWords, "words mismatch for %q", c)
+		require.Equalf(t, wantLines, gotLines, "lines mismatch for %q", c)
+	}
+}
+
+// TestBodyMetricsCountingDoesNotAllocate verifies that the bytes.Count-based
+// word/line expressions perform zero allocations (the previous string(respbody)
+// + strings.Split approach allocated O(len(body))). The expressions are copied
+// from Do rather than called through it, so this test does not notice changes
+// made to Do itself; keep the two in sync by hand.
+func TestBodyMetricsCountingDoesNotAllocate(t *testing.T) {
+	body := bytes.Repeat([]byte("lorem ipsum dolor sit amet\n"), 40000) // ~1MB
+	var words, lines int
+
+	allocs := testing.AllocsPerRun(50, func() {
+		// same expressions as the hot path in Do(); duplicated here, not shared
+		words = bytes.Count(body, []byte{' '}) + 1
+		lines = bytes.Count(bytes.TrimSpace(body), []byte{'\n'}) + 1
+	})
+
+	require.NotZero(t, words)
+	require.NotZero(t, lines)
+	require.Zerof(t, allocs, "word/line counting must not allocate, got %v allocs/op", allocs)
+}