From d2cb8594dcc7328e7876f6fdf0deb7f8af149622 Mon Sep 17 00:00:00 2001 From: Mzack9999 Date: Wed, 24 Jun 2026 23:57:09 +0200 Subject: [PATCH] reduce memory --- README.md | 4 +- common/httpx/httpx.go | 41 +++--- common/httpx/option.go | 23 +++- common/httpx/response_memory_test.go | 190 +++++++++++++++++++++++++++ 4 files changed, 234 insertions(+), 24 deletions(-) create mode 100644 common/httpx/response_memory_test.go diff --git a/README.md b/README.md index 234b7ae0..e41811d1 100644 --- a/README.md +++ b/README.md @@ -265,8 +265,8 @@ OPTIMIZATIONS: -retries int number of retries -timeout int timeout in seconds (default 10) -delay value duration between each http request (eg: 200ms, 1s) (default -1ns) - -rsts, -response-size-to-save int max response size to save in bytes (default 512000000) - -rstr, -response-size-to-read int max response size to read in bytes (default 512000000) + -rsts, -response-size-to-save int max response size to save in bytes (default 50000000) + -rstr, -response-size-to-read int max response size to read in bytes (default 50000000) CLOUD: -auth configure projectdiscovery cloud (pdcp) api key (default true) diff --git a/common/httpx/httpx.go b/common/httpx/httpx.go index c6d46c4d..7820a022 100644 --- a/common/httpx/httpx.go +++ b/common/httpx/httpx.go @@ -1,6 +1,7 @@ package httpx import ( + "bytes" "context" "crypto/tls" "fmt" @@ -291,21 +292,19 @@ get_response: return nil, closeErr } - // Todo: replace with https://github.com/projectdiscovery/utils/issues/110 - resp.RawData = make([]byte, len(respbody)) - copy(resp.RawData, respbody) + // Keep a reference to the undecoded body. DecodeData returns the same slice + // when no transcoding is needed (the common case), so RawData and Data end up + // sharing the same backing array and we avoid an extra full-body copy. When + // DecodeData transcodes it returns a fresh slice, so RawData still holds the + // original undecoded bytes. Both fields are read-only afterwards, so sharing + // the backing array is safe. + rawbody := respbody respbody, err = DecodeData(respbody, httpresp.Header) if err != nil && !shouldIgnoreBodyErrors { return nil, err } - - respbodystr := string(respbody) - - // check if we need to strip html - if h.Options.VHostStripHTML { - respbodystr = h.htmlPolicy.Sanitize(respbodystr) - } + resp.RawData = rawbody // if content length is not defined if resp.ContentLength <= 0 { @@ -326,11 +325,23 @@ get_response: // fill metrics resp.StatusCode = httpresp.StatusCode - if respbodystr != "" { - // number of words - resp.Words = len(strings.Split(respbodystr, " ")) - // number of lines - resp.Lines = len(strings.Split(strings.TrimSpace(respbodystr), "\n")) + + // Word/line counts are computed directly over the body bytes to avoid + // materializing an extra full-body string copy (and the slice produced by + // strings.Split) on the hot path. When HTML stripping is enabled the + // sanitized string is required, so counts are derived from it to preserve the + // previous behavior. + if h.Options.VHostStripHTML { + respbodystr := h.htmlPolicy.Sanitize(string(respbody)) + if respbodystr != "" { + resp.Words = len(strings.Split(respbodystr, " ")) + resp.Lines = len(strings.Split(strings.TrimSpace(respbodystr), "\n")) + } + } else if len(respbody) > 0 { + // equivalent to len(strings.Split(string(respbody), " ")) and + // len(strings.Split(strings.TrimSpace(string(respbody)), "\n")) + resp.Words = bytes.Count(respbody, []byte{' '}) + 1 + resp.Lines = bytes.Count(bytes.TrimSpace(respbody), []byte{'\n'}) + 1 } if !h.Options.Unsafe && h.Options.TLSGrab { diff --git a/common/httpx/option.go b/common/httpx/option.go index 56d11cbc..ce43aad7 100644 --- a/common/httpx/option.go +++ b/common/httpx/option.go @@ -10,13 +10,22 @@ import ( "github.com/projectdiscovery/networkpolicy" ) -// DefaultMaxResponseBodySize is the default maximum response body size -var DefaultMaxResponseBodySize int64 - -func init() { - maxResponseBodySize, _ := humanize.ParseBytes("512Mb") - DefaultMaxResponseBodySize = int64(maxResponseBodySize) -} +// DefaultMaxResponseBodySize is the default maximum response body size that httpx +// reads into memory for processing (and, via the runner, the default cap for +// responses stored to disk with -sr). It is intentionally bounded: the body is +// held in memory and the footprint scales with the number of concurrent threads, +// so a very large cap can lead to excessive memory usage / OOM on large +// responses. Normal web pages are far smaller than this; use -rstr / -rsts to +// read or store larger responses when needed. +// +// NOTE: this is a var initializer (not an init() function) on purpose. init() +// functions run after all package-level variable initializers, so computing the +// value in init() left DefaultOptions (which references it below) observing a +// zero value during package initialization. +var DefaultMaxResponseBodySize = func() int64 { + maxResponseBodySize, _ := humanize.ParseBytes("50mb") + return int64(maxResponseBodySize) +}() // Options contains configuration options for the client type Options struct { diff --git a/common/httpx/response_memory_test.go b/common/httpx/response_memory_test.go new file mode 100644 index 00000000..12ae7293 --- /dev/null +++ b/common/httpx/response_memory_test.go @@ -0,0 +1,190 @@ +package httpx + +import ( + "bytes" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/projectdiscovery/retryablehttp-go" + "github.com/stretchr/testify/require" + "golang.org/x/text/encoding/simplifiedchinese" + "golang.org/x/text/transform" +) + +// newLocalHTTPX builds an HTTPX instance suitable for hitting a local test +// server only (no external network, CDN checks disabled). +func newLocalHTTPX(t *testing.T) *HTTPX { + t.Helper() + options := DefaultOptions + options.CdnCheck = "false" + options.Timeout = 5 * time.Second + options.RetryMax = 0 + // NB: relies on DefaultOptions.MaxResponseBodySizeToRead being non-zero + // (see TestDefaultOptionsHasNonZeroReadSize) so the body is actually read. + + ht, err := New(&options) + require.NoError(t, err) + return ht +} + +// doLocal issues a GET against a local httptest server and returns the parsed +// httpx Response. +func doLocal(t *testing.T, ht *HTTPX, url string) *Response { + t.Helper() + req, err := retryablehttp.NewRequest(http.MethodGet, url, nil) + require.NoError(t, err) + resp, err := ht.Do(req, UnsafeOptions{}) + require.NoError(t, err) + return resp +} + +// legacyWordsLines reproduces the exact word/line computation that existed +// before the refactor, so we can assert the new byte-based path is equivalent. +func legacyWordsLines(body []byte) (words, lines int) { + s := string(body) + if s != "" { + words = len(strings.Split(s, " ")) + lines = len(strings.Split(strings.TrimSpace(s), "\n")) + } + return +} + +// TestDefaultOptionsHasNonZeroReadSize guards against the package var-init +// ordering regression where DefaultOptions was initialized before +// DefaultMaxResponseBodySize, leaving MaxResponseBodySizeToRead at 0 (which made +// LimitReader read zero bytes and produced empty bodies for library users). +func TestDefaultOptionsHasNonZeroReadSize(t *testing.T) { + require.NotZero(t, DefaultMaxResponseBodySize) + require.Equal(t, DefaultMaxResponseBodySize, DefaultOptions.MaxResponseBodySizeToRead) +} + +func TestDoBodyNoDecodePreservesRawAndData(t *testing.T) { + body := []byte("hello world\nsecond line\n") + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write(body) + })) + defer ts.Close() + + resp := doLocal(t, newLocalHTTPX(t), ts.URL) + + require.Equal(t, body, resp.Data, "decoded data must equal body") + require.Equal(t, body, resp.RawData, "raw data must equal undecoded body") + + wantWords, wantLines := legacyWordsLines(body) + require.Equal(t, wantWords, resp.Words) + require.Equal(t, wantLines, resp.Lines) +} + +func TestDoBodyEmpty(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + })) + defer ts.Close() + + resp := doLocal(t, newLocalHTTPX(t), ts.URL) + require.Empty(t, resp.Data) + require.Empty(t, resp.RawData) + require.Equal(t, 0, resp.Words) + require.Equal(t, 0, resp.Lines) +} + +// TestDoBodyGBKDecodeKeepsRawUndecoded ensures that when DecodeData actually +// transcodes the body, RawData still holds the original (undecoded) bytes while +// Data holds the decoded UTF-8 bytes. +func TestDoBodyGBKDecodeKeepsRawUndecoded(t *testing.T) { + utf8Body := "你好世界 测试" + gbkBody, _, err := transform.Bytes(simplifiedchinese.GBK.NewEncoder(), []byte(utf8Body)) + require.NoError(t, err) + require.NotEqual(t, []byte(utf8Body), gbkBody, "precondition: gbk bytes differ from utf8") + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html; charset=gbk") + _, _ = w.Write(gbkBody) + })) + defer ts.Close() + + resp := doLocal(t, newLocalHTTPX(t), ts.URL) + + require.Equal(t, gbkBody, resp.RawData, "RawData must hold the original undecoded bytes") + require.Equal(t, []byte(utf8Body), resp.Data, "Data must hold the decoded UTF-8 bytes") +} + +// TestDoBodyNoDecodeSharesBacking documents the memory optimization: on the +// no-decode hot path RawData and Data share the same backing array (no extra +// full-body copy is made). +func TestDoBodyNoDecodeSharesBacking(t *testing.T) { + body := []byte("shared backing array body") + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write(body) + })) + defer ts.Close() + + resp := doLocal(t, newLocalHTTPX(t), ts.URL) + + require.NotEmpty(t, resp.Data) + require.NotEmpty(t, resp.RawData) + require.Equal(t, resp.Data, resp.RawData) + require.Same(t, &resp.Data[0], &resp.RawData[0], + "RawData and Data should share the backing array on the no-decode path") +} + +// TestWordsLinesEquivalence is the core guard for the refactor: the byte-based +// counting used on the hot path must be identical to the previous +// strings.Split-based counting for a wide range of inputs and edge cases. +func TestWordsLinesEquivalence(t *testing.T) { + cases := []string{ + "", + "a", + "a b c", + " ", // only spaces + "a b", // consecutive spaces + "line1\nline2\nline3", // multiple lines + "\n\n\n", // only newlines + " leading and trailing ", // surrounding whitespace + "\n mixed \t whitespace \n", // tabs/newlines around + "trailing newline\n", + "word", + "tab\tseparated values", + "unicode \u00a0 nbsp space", + "emoji 😀 and spaces ", + } + + for _, c := range cases { + body := []byte(c) + wantWords, wantLines := legacyWordsLines(body) + + var gotWords, gotLines int + if len(body) > 0 { + gotWords = bytes.Count(body, []byte{' '}) + 1 + gotLines = bytes.Count(bytes.TrimSpace(body), []byte{'\n'}) + 1 + } + + require.Equalf(t, wantWords, gotWords, "words mismatch for %q", c) + require.Equalf(t, wantLines, gotLines, "lines mismatch for %q", c) + } +} + +// TestBodyMetricsCountingDoesNotAllocate locks in the optimization: the +// byte-based word/line counting used on the hot path must not allocate (the +// previous string(respbody) + strings.Split approach allocated O(len(body))). +// If someone reintroduces a full-body string copy or Split-based counting, this +// test fails. +func TestBodyMetricsCountingDoesNotAllocate(t *testing.T) { + body := bytes.Repeat([]byte("lorem ipsum dolor sit amet\n"), 40000) // ~1MB + var words, lines int + + allocs := testing.AllocsPerRun(50, func() { + // identical expressions to the hot path in Do() + words = bytes.Count(body, []byte{' '}) + 1 + lines = bytes.Count(bytes.TrimSpace(body), []byte{'\n'}) + 1 + }) + + require.NotZero(t, words) + require.NotZero(t, lines) + require.Zerof(t, allocs, "word/line counting must not allocate, got %v allocs/op", allocs) +}