// drover-go/internal/checker/checker.go

package checker

import (
	"context"
	"crypto/tls"
	"encoding/binary"
	"errors"
	"fmt"
	"net"
	"net/http"
	"regexp"
	"strconv"
	"time"
)

// Status represents the lifecycle state of a single test. It is a string
// type so it serialises directly into the Result JSON.
type Status string

// Result statuses emitted on the channel.
const (
	// StatusRunning is emitted by runAttempt at the start of every attempt.
	StatusRunning Status = "running"
	// StatusPassed is emitted when an attempt returns a nil error.
	StatusPassed  Status = "passed"
	// StatusFailed is emitted when an attempt returns an error, and by
	// emitCancelled (with Error="cancelled") when the run is cancelled.
	StatusFailed  Status = "failed"
	// StatusSkipped is emitted by emitSkipped when a test is not run at
	// all (a prerequisite failed, or the run was already cancelled).
	StatusSkipped Status = "skipped"
)

// Result is one event in the diagnostic stream. Multiple Results may be
// emitted per test (one per attempt: running + passed/failed; on retry,
// running again then passed/failed).
type Result struct {
	// ID names the test the event belongs to. This file uses "tcp",
	// "greet", "auth", "connect", "udp", "stun" and "api".
	ID       string        `json:"id"`
	// Status is the lifecycle state this event reports.
	Status   Status        `json:"status"`
	// Metric is the short success string returned by the attempt
	// (for example "REP=00" or a latency); only set on passed results.
	Metric   string        `json:"metric,omitempty"`
	// Error is the failure text: err.Error() for a failed attempt,
	// "cancelled" for a cancelled one, or the skip reason for skipped.
	Error    string        `json:"error,omitempty"`
	// Hint is whatever hintFor returns for (ID, err); hintFor is defined
	// outside this file.
	Hint     string        `json:"hint,omitempty"`
	// RawHex is the hex payload extracted from a "(raw=...)" marker in
	// the error text, or empty when the error carries none.
	RawHex   string        `json:"raw_hex,omitempty"`
	// Duration is the wall-clock time of the attempt; it is zero on
	// running, skipped and pre-attempt cancelled events.
	// NOTE(review): encoding/json marshals time.Duration as integer
	// nanoseconds, which disagrees with the "duration_ms" key unless a
	// custom marshaller exists elsewhere — confirm with consumers.
	Duration time.Duration `json:"duration_ms"`
	// Attempt is the 1-based attempt number. Skipped results leave it 0.
	Attempt  int           `json:"attempt"`
}

// Config drives Run. Zero-value fields receive defaults via applyDefaults.
type Config struct {
	// ProxyHost and ProxyPort are joined into the SOCKS5 proxy address
	// that every test dials.
	ProxyHost     string
	ProxyPort     int
	// UseAuth enables the "auth" step and is passed to the SOCKS5
	// greeting; ProxyLogin/ProxyPassword are only used when it is true.
	UseAuth       bool
	ProxyLogin    string
	ProxyPassword string

	// PerTestTimeout bounds a single attempt. Values <= 0 become 5s.
	PerTestTimeout time.Duration
	// MaxRetries is the number of extra attempts after the first.
	// Values <= 0 become 1, so retries cannot be disabled via Config.
	MaxRetries     int
	// RetryBackoff is the pause between attempts. Values <= 0 become
	// 500ms.
	RetryBackoff   time.Duration

	// DiscordGateway is the host:port target of the SOCKS5 CONNECT
	// test. Defaults to "gateway.discord.gg:443".
	DiscordGateway string
	// DiscordAPI is the URL fetched through the proxy by the "api"
	// test. Defaults to "https://discord.com/api/v9/gateway".
	DiscordAPI     string
	// StunServer is the host:port STUN target reached through the UDP
	// relay. Defaults to "stun.l.google.com:19302".
	StunServer     string
}

// applyDefaults fills in every unset knob of cfg and returns the result.
// cfg is received by value, so the caller's Config is never modified.
//
// Numeric knobs treat any non-positive value as "unset". In particular
// MaxRetries cannot be switched off here: both 0 and negative values
// become the default of 1 so that a fresh `Config{}` works.
func applyDefaults(cfg Config) Config {
	const (
		defaultTimeout = 5 * time.Second
		defaultRetries = 1
		defaultBackoff = 500 * time.Millisecond
		defaultGateway = "gateway.discord.gg:443"
		defaultAPI     = "https://discord.com/api/v9/gateway"
		defaultStun    = "stun.l.google.com:19302"
	)

	if cfg.PerTestTimeout <= 0 {
		cfg.PerTestTimeout = defaultTimeout
	}
	if cfg.MaxRetries <= 0 {
		cfg.MaxRetries = defaultRetries
	}
	if cfg.RetryBackoff <= 0 {
		cfg.RetryBackoff = defaultBackoff
	}

	// String endpoints: only the empty string counts as unset.
	for _, knob := range []struct {
		field    *string
		fallback string
	}{
		{&cfg.DiscordGateway, defaultGateway},
		{&cfg.DiscordAPI, defaultAPI},
		{&cfg.StunServer, defaultStun},
	} {
		if *knob.field == "" {
			*knob.field = knob.fallback
		}
	}
	return cfg
}

// Run executes the diagnostic steps in order (tcp, greet, auth when
// cfg.UseAuth is set, connect, udp, stun, api) and streams their Results
// on the returned channel. The channel is buffered and is closed once the
// last step has finished and all connections opened during the run have
// been closed.
//
// Cancel ctx to abort: the in-flight test emits a Failed Result with
// Error="cancelled", and remaining tests each emit a single Skipped Result.
func Run(ctx context.Context, cfg Config) <-chan Result {
	cfg = applyDefaults(cfg)
	results := make(chan Result, 16)
	exec := &executor{ctx: ctx, cfg: cfg, ch: results}

	// Assemble the step list up front; auth is only part of the run
	// when the proxy is configured with credentials.
	steps := []func(){exec.runTCP, exec.runGreet}
	if cfg.UseAuth {
		steps = append(steps, exec.runAuth)
	}
	steps = append(steps, exec.runConnect, exec.runUDP, exec.runStun, exec.runAPI)

	go func() {
		// Deferred calls run in reverse order: release connections
		// first, then close the channel.
		defer close(results)
		defer exec.cleanup()

		for _, step := range steps {
			step()
		}
	}()

	return results
}

// executor carries shared state across the 7 test methods. In this file
// it is created by Run and its methods are only called from the single
// goroutine Run starts.
type executor struct {
	// ctx is the caller's context; cancelling it aborts the run.
	ctx context.Context
	// cfg is the Config after applyDefaults.
	cfg Config
	// ch is the send side of the channel returned by Run.
	ch  chan<- Result

	// tcpConn is opened in runTCP and reused by greet/auth/connect.
	// A failed attempt closes it and sets it to nil so the next
	// attempt redials.
	tcpConn net.Conn

	// udpConn2 is the SECOND TCP control channel opened in runUDP.
	// Must stay alive until stun finishes — the SOCKS5 spec requires
	// the control TCP connection to remain up for the relay to be
	// valid.
	udpConn2 net.Conn

	// udpRelay is the UDP relay endpoint announced by the proxy in
	// the UDP ASSOCIATE reply.
	udpRelay *net.UDPAddr

	// udpClient is our local UDP socket used to talk to the relay.
	// runStun replaces it with a fresh socket on every attempt.
	udpClient net.PacketConn

	// Step gating: each xOK is set true on success. Later steps pass
	// the flags of their prerequisites to shouldSkip.
	tcpOK, greetOK, authOK, connectOK, udpOK bool

	// Cancellation latch. Once any test emits a "cancelled" failure,
	// remaining tests emit a single Skipped result with the same reason.
	cancelled bool
}

// cleanup releases every connection the run may still hold: the main
// proxy connection, the UDP control channel, and the local UDP socket.
// Fields that were never opened (nil) are skipped, and Close errors are
// deliberately ignored because the run is already over.
func (e *executor) cleanup() {
	type closer interface{ Close() error }

	for _, c := range []closer{e.tcpConn, e.udpConn2, e.udpClient} {
		if c == nil {
			continue
		}
		_ = c.Close()
	}
}

// emit delivers r to the result channel. It blocks until either the send
// succeeds or the run's context is done, so a stalled consumer cannot
// hold the executor forever after cancellation.
func (e *executor) emit(r Result) {
	select {
	case e.ch <- r:
		return
	case <-e.ctx.Done():
	}

	// The context is done. Make one last non-blocking attempt so that a
	// cancel racing the send does not drop user-visible information when
	// the channel still has room; if it is full, the Result is dropped.
	select {
	case e.ch <- r:
	default:
	}
}

// emitSkipped reports that test id was not run. reason is carried in the
// Error field; Attempt and Duration stay at their zero values.
func (e *executor) emitSkipped(id, reason string) {
	skipped := Result{ID: id, Status: StatusSkipped}
	skipped.Error = reason
	e.emit(skipped)
}

// emitCancelled records that the run was cancelled while test id was in
// flight (or about to start). It sets the cancellation latch first, so
// every later test is skipped, and then emits one Failed Result whose
// Error is "cancelled".
func (e *executor) emitCancelled(id string, attempt int, dur time.Duration) {
	e.cancelled = true

	res := Result{ID: id, Status: StatusFailed, Error: "cancelled"}
	res.Hint = hintFor(id, context.Canceled)
	res.Attempt = attempt
	res.Duration = dur
	e.emit(res)
}

// shouldSkip decides whether test id may run at all and, if not, emits
// the single Result that stands in for it. The guards are checked in
// priority order:
//
//  1. the run was already marked cancelled -> Skipped, reason "cancelled"
//  2. a prerequisite failed (depOK false)   -> Skipped, reason skipReason
//  3. the context has just been cancelled   -> Failed "cancelled"
//
// It returns true when the caller must not run the test.
func (e *executor) shouldSkip(id string, depOK bool) bool {
	switch {
	case e.cancelled:
		e.emitSkipped(id, "cancelled")
	case !depOK:
		e.emitSkipped(id, skipReason)
	case e.ctx.Err() != nil:
		e.emitCancelled(id, 1, 0)
	default:
		return false
	}
	return true
}

// skipReason is the Error text of the Skipped Result emitted by shouldSkip
// when a test's prerequisite step did not succeed.
const skipReason = "depends on previous failed step"

// rawHexRE matches the "(raw=DEADBEEF)" marker inside a wrapped error
// string and captures the hex digits (either case) as group 1.
var rawHexRE = regexp.MustCompile(`\(raw=([0-9a-fA-F]+)\)`)

// extractRawHex returns the hex payload of the first "(raw=XX...)" marker
// found in s, following our error-wrapping convention. It returns "" when
// s contains no such marker.
func extractRawHex(s string) string {
	if groups := rawHexRE.FindStringSubmatch(s); len(groups) > 1 {
		return groups[1]
	}
	return ""
}

// runAttempt is the inner loop shared by all tests. It handles emitting
// running/passed/failed results, retry classification and backoff.
//
// run does the actual work for one attempt and returns metric + err. It
// is called with a context derived from the executor's context and
// limited to cfg.PerTestTimeout. Up to 1+cfg.MaxRetries attempts are
// made; every attempt emits a Running Result followed by exactly one
// Passed or Failed Result.
//
// The return value is true only when some attempt succeeded. If the
// caller's context is cancelled — before an attempt, during one, or
// during the backoff between attempts — the cancellation latch is set so
// that every remaining test emits a single Skipped Result.
func (e *executor) runAttempt(id string, run func(ctx context.Context) (string, error)) (ok bool) {
	maxAttempts := 1 + e.cfg.MaxRetries
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if e.ctx.Err() != nil {
			e.emitCancelled(id, attempt, 0)
			return false
		}

		// Emit running for this attempt.
		e.emit(Result{ID: id, Status: StatusRunning, Attempt: attempt})

		attemptCtx, cancel := context.WithTimeout(e.ctx, e.cfg.PerTestTimeout)
		start := time.Now()
		metric, err := run(attemptCtx)
		dur := time.Since(start)
		cancel()

		if err == nil {
			e.emit(Result{
				ID:       id,
				Status:   StatusPassed,
				Metric:   metric,
				Attempt:  attempt,
				Duration: dur,
			})
			return true
		}

		// Parent-ctx cancelled? Emit cancelled and stop (no retry
		// into a cancelled context). We check the PARENT ctx, not
		// attemptCtx (which always expires after PerTestTimeout).
		if e.ctx.Err() != nil {
			e.emitCancelled(id, attempt, dur)
			return false
		}

		// Every failed attempt is reported, whether or not a retry
		// follows: the spec says "emit running + passed/failed per
		// attempt", and it lets observers see each try.
		msg := err.Error()
		e.emit(Result{
			ID:       id,
			Status:   StatusFailed,
			Error:    msg,
			Hint:     hintFor(id, err),
			RawHex:   extractRawHex(msg),
			Attempt:  attempt,
			Duration: dur,
		})

		// A context error at this point can only be the per-attempt
		// deadline (the parent was checked above). classifyError
		// treats context.DeadlineExceeded as permanent, but blowing
		// our own per-test budget is exactly what retries are for,
		// so it is forced to transient here.
		transient := isContextErr(err) || classifyError(err) == ClassificationTransient
		if !transient || attempt == maxAttempts {
			// Final failure (permanent or out of retries).
			return false
		}

		// Back off before the next attempt, but stay cancellable.
		backoff := time.NewTimer(e.cfg.RetryBackoff)
		select {
		case <-backoff.C:
		case <-e.ctx.Done():
			backoff.Stop()
			// The failure of this attempt has already been
			// reported, so no extra Result is emitted; just latch
			// the cancellation so remaining tests are skipped
			// with reason "cancelled".
			e.cancelled = true
			return false
		}
	}
	return false
}

// proxyAddr formats the configured proxy endpoint as "host:port". IPv6
// literals are bracketed by net.JoinHostPort.
func (e *executor) proxyAddr() string {
	port := strconv.Itoa(e.cfg.ProxyPort)
	return net.JoinHostPort(e.cfg.ProxyHost, port)
}

// runTCP — Test 1: open a TCP connection to the proxy. On success the
// connection is kept in e.tcpConn for the following steps and the metric
// reports the dial time in milliseconds.
func (e *executor) runTCP() {
	// tcp has no prerequisite step, so only the cancellation guards of
	// shouldSkip can trigger here.
	if e.shouldSkip("tcp", true) {
		return
	}

	e.tcpOK = e.runAttempt("tcp", func(ctx context.Context) (string, error) {
		// Drop whatever a previous attempt may have left behind.
		if stale := e.tcpConn; stale != nil {
			_ = stale.Close()
			e.tcpConn = nil
		}

		dialer := new(net.Dialer)
		began := time.Now()
		conn, err := dialer.DialContext(ctx, "tcp", e.proxyAddr())
		if err != nil {
			return "", err
		}
		e.tcpConn = conn
		elapsed := time.Since(began).Milliseconds()
		return fmt.Sprintf("%dms", elapsed), nil
	})
}

// runGreet — Test 2: SOCKS5 method negotiation on e.tcpConn. The metric
// names the method the proxy selected. Skipped unless the tcp step passed.
func (e *executor) runGreet() {
	if e.shouldSkip("greet", e.tcpOK) {
		return
	}

	e.greetOK = e.runAttempt("greet", func(ctx context.Context) (string, error) {
		// A failed attempt discards the connection (see below), so
		// retries get a fresh one; the first attempt reuses the
		// connection opened by runTCP.
		if err := e.redialTCPIfNeeded(ctx); err != nil {
			return "", err
		}

		method, _, err := socks5Greeting(ctx, e.tcpConn, e.cfg.UseAuth)
		if err != nil {
			// The proxy may be stuck mid-handshake: throw the
			// connection away so the next attempt redials.
			_ = e.tcpConn.Close()
			e.tcpConn = nil
			return "", err
		}

		if method == 0x00 {
			return "no auth", nil
		}
		if method == 0x02 {
			return "auth required", nil
		}
		return fmt.Sprintf("method=0x%02X", method), nil
	})
}

// redialTCPIfNeeded makes sure e.tcpConn is usable at the start of an
// attempt. It never closes an existing connection: if tcpConn is non-nil
// it is kept as is, otherwise a new connection to the proxy is dialled
// and stored.
//
// On the first greet attempt tcpConn is still the connection opened by
// runTCP. Retry paths close tcpConn and set it to nil after a failure,
// which is what makes this function redial on the next attempt.
func (e *executor) redialTCPIfNeeded(ctx context.Context) error {
	if e.tcpConn == nil {
		conn, err := new(net.Dialer).DialContext(ctx, "tcp", e.proxyAddr())
		if err != nil {
			return err
		}
		e.tcpConn = conn
	}
	return nil
}

// runAuth — Test 3: user/pass sub-negotiation. Only emitted when UseAuth.
// Skipped unless the greet step passed.
//
// The first attempt authenticates on the connection greet already
// negotiated. Any failure discards that connection, so a retry has to
// redial and repeat the greeting before it can authenticate again.
func (e *executor) runAuth() {
	if e.shouldSkip("auth", e.greetOK) {
		return
	}

	// dropConn discards the proxy connection so the next attempt starts
	// from a clean dial instead of reusing a half-negotiated socket.
	dropConn := func() {
		if e.tcpConn != nil {
			_ = e.tcpConn.Close()
			e.tcpConn = nil
		}
	}

	ok := e.runAttempt("auth", func(ctx context.Context) (string, error) {
		// A nil tcpConn means a prior attempt failed and dropped it.
		// We can't replay only auth — the proxy has already moved
		// past method negotiation — so redial and greet again.
		if e.tcpConn == nil {
			var d net.Dialer
			conn, derr := d.DialContext(ctx, "tcp", e.proxyAddr())
			if derr != nil {
				return "", derr
			}
			e.tcpConn = conn
			if _, _, gerr := socks5Greeting(ctx, e.tcpConn, true); gerr != nil {
				// Without this the next attempt would see a
				// non-nil conn, skip the greeting and send auth
				// on a connection whose negotiation failed.
				dropConn()
				return "", gerr
			}
		}
		if _, err := socks5Auth(ctx, e.tcpConn, e.cfg.ProxyLogin, e.cfg.ProxyPassword); err != nil {
			// Force redial+regreet on next attempt.
			dropConn()
			return "", err
		}
		return "ok", nil
	})
	e.authOK = ok
}

// runConnect — Test 4: SOCKS5 CONNECT to Discord gateway. Skipped unless
// greet passed and, when UseAuth is set, auth passed as well.
//
// cfg.DiscordGateway must be a host:port pair with a 16-bit port; a
// malformed value fails the test immediately with a single Failed
// Result and no attempt is made.
func (e *executor) runConnect() {
	dep := e.greetOK && (!e.cfg.UseAuth || e.authOK)
	if e.shouldSkip("connect", dep) {
		return
	}

	host, portStr, splitErr := net.SplitHostPort(e.cfg.DiscordGateway)
	if splitErr != nil {
		e.emit(Result{
			ID:      "connect",
			Status:  StatusFailed,
			Error:   fmt.Sprintf("bad DiscordGateway %q: %s", e.cfg.DiscordGateway, splitErr.Error()),
			Hint:    hintFor("connect", splitErr),
			Attempt: 1,
		})
		return
	}
	port64, perr := strconv.ParseUint(portStr, 10, 16)
	if perr != nil {
		e.emit(Result{
			ID:      "connect",
			Status:  StatusFailed,
			Error:   fmt.Sprintf("bad DiscordGateway port %q: %s", portStr, perr.Error()),
			Hint:    hintFor("connect", perr),
			Attempt: 1,
		})
		return
	}
	port := uint16(port64)

	// dropConn discards the proxy connection so the next attempt starts
	// from a clean dial instead of reusing a half-negotiated socket.
	dropConn := func() {
		if e.tcpConn != nil {
			_ = e.tcpConn.Close()
			e.tcpConn = nil
		}
	}

	ok := e.runAttempt("connect", func(ctx context.Context) (string, error) {
		// A nil tcpConn means a prior attempt failed and dropped it:
		// redial+greet+(auth) before re-CONNECT.
		if e.tcpConn == nil {
			var d net.Dialer
			conn, derr := d.DialContext(ctx, "tcp", e.proxyAddr())
			if derr != nil {
				return "", derr
			}
			e.tcpConn = conn
			// If the handshake fails part-way the connection must be
			// dropped too; otherwise the next attempt would skip the
			// handshake and send CONNECT on a broken session.
			if _, _, gerr := socks5Greeting(ctx, e.tcpConn, e.cfg.UseAuth); gerr != nil {
				dropConn()
				return "", gerr
			}
			if e.cfg.UseAuth {
				if _, aerr := socks5Auth(ctx, e.tcpConn, e.cfg.ProxyLogin, e.cfg.ProxyPassword); aerr != nil {
					dropConn()
					return "", aerr
				}
			}
		}
		if _, err := socks5Connect(ctx, e.tcpConn, host, port); err != nil {
			dropConn()
			return "", err
		}
		return "REP=00", nil
	})
	e.connectOK = ok
}

// runUDP — Test 5: open a second TCP control channel to the proxy and
// request UDP ASSOCIATE on it. On success the announced relay address is
// stored in e.udpRelay and the control channel stays open in e.udpConn2
// for the stun step. Skipped unless greet passed and, when UseAuth is
// set, auth passed as well.
func (e *executor) runUDP() {
	authSatisfied := !e.cfg.UseAuth || e.authOK
	if e.shouldSkip("udp", e.greetOK && authSatisfied) {
		return
	}

	e.udpOK = e.runAttempt("udp", func(ctx context.Context) (string, error) {
		// Every attempt gets its own control channel; release the
		// one a previous attempt may have left behind.
		if old := e.udpConn2; old != nil {
			_ = old.Close()
			e.udpConn2 = nil
		}

		ctrl, dialErr := new(net.Dialer).DialContext(ctx, "tcp", e.proxyAddr())
		if dialErr != nil {
			return "", dialErr
		}
		e.udpConn2 = ctrl

		// Full handshake on the new channel: greeting, then auth if
		// configured, then the UDP ASSOCIATE request itself.
		if _, _, greetErr := socks5Greeting(ctx, ctrl, e.cfg.UseAuth); greetErr != nil {
			return "", greetErr
		}
		if e.cfg.UseAuth {
			if _, authErr := socks5Auth(ctx, ctrl, e.cfg.ProxyLogin, e.cfg.ProxyPassword); authErr != nil {
				return "", authErr
			}
		}
		relay, _, assocErr := socks5UDPAssociate(ctx, ctrl)
		if assocErr != nil {
			return "", assocErr
		}

		e.udpRelay = relay
		return fmt.Sprintf("relay %s:%d", relay.IP.String(), relay.Port), nil
	})
}

// runStun — Test 6: STUN through the SOCKS5 UDP relay. Skipped unless the
// udp step passed.
//
// cfg.StunServer must be a host:port pair with a 16-bit port; a malformed
// value fails the test immediately with a single Failed Result. Each
// attempt resolves the host to IPv4, sends one STUN binding request
// wrapped in a SOCKS5 UDP header to e.udpRelay, and waits for one reply.
// The metric is the request/reply round-trip time.
func (e *executor) runStun() {
	if e.shouldSkip("stun", e.udpOK) {
		return
	}

	host, portStr, splitErr := net.SplitHostPort(e.cfg.StunServer)
	if splitErr != nil {
		e.emit(Result{
			ID:      "stun",
			Status:  StatusFailed,
			Error:   fmt.Sprintf("bad StunServer %q: %s", e.cfg.StunServer, splitErr.Error()),
			Hint:    hintFor("stun", splitErr),
			Attempt: 1,
		})
		return
	}
	port64, perr := strconv.ParseUint(portStr, 10, 16)
	if perr != nil {
		e.emit(Result{
			ID:      "stun",
			Status:  StatusFailed,
			Error:   fmt.Sprintf("bad StunServer port %q: %s", portStr, perr.Error()),
			Hint:    hintFor("stun", perr),
			Attempt: 1,
		})
		return
	}
	stunPort := uint16(port64)

	e.runAttempt("stun", func(ctx context.Context) (string, error) {
		// Resolve STUN host to an IPv4. We don't support IPv6 STUN.
		ips, err := (&net.Resolver{}).LookupIP(ctx, "ip4", host)
		if err != nil {
			return "", fmt.Errorf("stun: lookup %s: %w", host, err)
		}
		var stunIP4 net.IP
		for _, ip := range ips {
			if v4 := ip.To4(); v4 != nil {
				stunIP4 = v4
				break
			}
		}
		if stunIP4 == nil {
			return "", errors.New("stun: no IPv4 for STUN server")
		}

		// Open a fresh local UDP socket per attempt.
		if e.udpClient != nil {
			_ = e.udpClient.Close()
			e.udpClient = nil
		}
		pc, err := net.ListenPacket("udp", ":0")
		if err != nil {
			return "", fmt.Errorf("stun: listen udp: %w", err)
		}
		e.udpClient = pc
		if dl, ok := ctx.Deadline(); ok {
			_ = pc.SetDeadline(dl)
		}

		// The socket deadline only covers the per-attempt timeout. To
		// also react to the caller cancelling the run, a watcher
		// expires the deadline as soon as ctx is done, which unblocks
		// the WriteTo/ReadFrom below. It exits when the attempt ends.
		attemptDone := make(chan struct{})
		defer close(attemptDone)
		go func() {
			select {
			case <-ctx.Done():
				_ = pc.SetDeadline(time.Now())
			case <-attemptDone:
			}
		}()

		// Build SOCKS5 UDP datagram: RSV(2)=0 FRAG=0 ATYP=01 IP(4) PORT(2) STUN(20)
		txID, err := NewTransactionID()
		if err != nil {
			return "", err
		}
		stunReq := EncodeBindingRequest(txID)
		dgram := make([]byte, 0, 10+len(stunReq))
		dgram = append(dgram, 0x00, 0x00, 0x00, 0x01)
		dgram = append(dgram, stunIP4...)
		dgram = binary.BigEndian.AppendUint16(dgram, stunPort)
		dgram = append(dgram, stunReq...)

		start := time.Now()
		if _, werr := pc.WriteTo(dgram, e.udpRelay); werr != nil {
			return "", fmt.Errorf("stun: write to relay: %w", werr)
		}

		readBuf := make([]byte, 1500)
		n, _, rerr := pc.ReadFrom(readBuf)
		if rerr != nil {
			return "", fmt.Errorf("stun: read from relay: %w", rerr)
		}
		rtt := time.Since(start)

		// 10 bytes is the smallest possible SOCKS5 UDP header (IPv4).
		if n < 10 {
			return "", fmt.Errorf("stun: relay reply too short (%d bytes)", n)
		}
		// Validate SOCKS5 UDP wrapper: RSV=00 00, FRAG=00.
		if readBuf[0] != 0x00 || readBuf[1] != 0x00 || readBuf[2] != 0x00 {
			return "", fmt.Errorf("stun: bad SOCKS5 UDP header (raw=%x)", readBuf[:10])
		}
		// The relay may describe the sender as IPv4, IPv6 or a domain
		// name; the header length depends on that address type.
		var hdrLen int
		switch readBuf[3] {
		case 0x01:
			hdrLen = 10
		case 0x04:
			hdrLen = 22
		case 0x03:
			// readBuf[4] (the name length) is in range: n >= 10.
			hdrLen = 4 + 1 + int(readBuf[4]) + 2
		default:
			return "", fmt.Errorf("stun: unknown SOCKS5 UDP ATYP=0x%02X", readBuf[3])
		}
		if n < hdrLen {
			return "", fmt.Errorf("stun: relay reply truncated (%d < %d)", n, hdrLen)
		}
		stunReply := readBuf[hdrLen:n]

		if _, _, perr := ParseBindingResponse(stunReply, txID); perr != nil {
			return "", perr
		}
		return fmt.Sprintf("%dms RTT", rtt.Milliseconds()), nil
	})
}

// runAPI — Test 7: HTTP GET of cfg.DiscordAPI with every connection
// tunnelled through the SOCKS5 proxy. Skipped unless the connect step
// passed. A response status of 200 or 401 counts as success; any other
// status fails the attempt.
func (e *executor) runAPI() {
	if e.shouldSkip("api", e.connectOK) {
		return
	}

	e.runAttempt("api", func(ctx context.Context) (string, error) {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, e.cfg.DiscordAPI, nil)
		if err != nil {
			return "", err
		}

		// A throw-away client per attempt: keep-alives are off, so
		// nothing is pooled, and every dial goes via the proxy.
		viaProxy := func(ctx context.Context, _, addr string) (net.Conn, error) {
			return e.dialThroughProxy(ctx, addr)
		}
		client := &http.Client{
			Timeout: e.cfg.PerTestTimeout,
			Transport: &http.Transport{
				DialContext:           viaProxy,
				TLSClientConfig:       &tls.Config{},
				DisableKeepAlives:     true,
				ResponseHeaderTimeout: e.cfg.PerTestTimeout,
			},
		}

		resp, err := client.Do(req)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()

		switch code := resp.StatusCode; code {
		case http.StatusOK, http.StatusUnauthorized:
			return fmt.Sprintf("HTTP %d", code), nil
		default:
			return "", fmt.Errorf("api: HTTP %d", code)
		}
	})
}

// dialThroughProxy is the http.Transport.DialContext used by runAPI. It
// splits addr into host and 16-bit port, opens a new TCP connection to
// the SOCKS5 proxy, and runs greeting, optional auth and CONNECT to that
// target. On success the tunnelled connection is returned with its
// deadline cleared; on any handshake failure the connection is closed and
// the handshake error is returned unchanged.
func (e *executor) dialThroughProxy(ctx context.Context, addr string) (net.Conn, error) {
	host, portStr, err := net.SplitHostPort(addr)
	if err != nil {
		return nil, fmt.Errorf("api: split %q: %w", addr, err)
	}
	port64, err := strconv.ParseUint(portStr, 10, 16)
	if err != nil {
		return nil, fmt.Errorf("api: bad port %q: %w", portStr, err)
	}

	conn, err := new(net.Dialer).DialContext(ctx, "tcp", e.proxyAddr())
	if err != nil {
		return nil, err
	}

	// handshake walks the SOCKS5 state machine on conn and reports the
	// first step that fails.
	handshake := func() error {
		if _, _, gerr := socks5Greeting(ctx, conn, e.cfg.UseAuth); gerr != nil {
			return gerr
		}
		if e.cfg.UseAuth {
			if _, aerr := socks5Auth(ctx, conn, e.cfg.ProxyLogin, e.cfg.ProxyPassword); aerr != nil {
				return aerr
			}
		}
		if _, cerr := socks5Connect(ctx, conn, host, uint16(port64)); cerr != nil {
			return cerr
		}
		return nil
	}
	if herr := handshake(); herr != nil {
		_ = conn.Close()
		return nil, herr
	}

	// Clear the deadline socks5* primitives applied — http.Transport
	// manages timing past this point.
	_ = conn.SetDeadline(time.Time{})
	return conn, nil
}