From 83ba775971f16b8e3a820d420ee76043dc007b6e Mon Sep 17 00:00:00 2001 From: Stepan Usatiuk Date: Sun, 12 Oct 2025 13:51:50 +0200 Subject: [PATCH] some stuff --- Firmware/AGENTS.md | 4 + Firmware/ghettoprof.sh | 248 +++++++++++++++++++++++++++++++++ Firmware/main/src/app_main.cpp | 72 +++++----- 3 files changed, 291 insertions(+), 33 deletions(-) create mode 100644 Firmware/AGENTS.md create mode 100755 Firmware/ghettoprof.sh diff --git a/Firmware/AGENTS.md b/Firmware/AGENTS.md new file mode 100644 index 0000000..a010379 --- /dev/null +++ b/Firmware/AGENTS.md @@ -0,0 +1,4 @@ +To build: +(in zsh) +. "$HOME/esp/esp-idf/export.sh" +idf.py build \ No newline at end of file diff --git a/Firmware/ghettoprof.sh b/Firmware/ghettoprof.sh new file mode 100755 index 0000000..1d7f5f3 --- /dev/null +++ b/Firmware/ghettoprof.sh @@ -0,0 +1,248 @@ +#!/usr/bin/env bash +# ghettoprof.sh — parallel symbol resolver + optional annotated disassembly +# Supports C++ demangling, LLVM disassembler, and optional no-inlines aggregation (symbol-table based). +# +# Usage: +# ./ghettoprof.sh [-j jobs] [--annotate] [--no-inlines] firmware.elf pcs.txt + +set -euo pipefail + +usage() { + echo "Usage: $0 [-j jobs] [--annotate] [--no-inlines] firmware.elf pcs.txt" + exit 1 +} + +ANNOTATE=0 +JOBS="" +NO_INLINES=0 + +# ---- args (options first, then the two positional paths) ---- +while [[ $# -gt 0 ]]; do + case "$1" in + -j) [[ $# -ge 2 ]] || usage; JOBS="$2"; shift 2 ;; + --annotate) ANNOTATE=1; shift ;; + --no-inlines) NO_INLINES=1; shift ;; + -h|--help) usage ;; + *) break ;; + esac +done +[[ $# -lt 2 ]] && usage +ELF="$1" +PCS_IN="$2" + +[[ ! -f "$ELF" ]] && { echo "ELF not found: $ELF" >&2; exit 2; } +[[ ! 
-f "$PCS_IN" ]] && { echo "PC log not found: $PCS_IN" >&2; exit 3; } + +# ---- tools ---- +ADDR2LINE="" +for t in llvm-addr2line eu-addr2line riscv32-esp-elf-addr2line xtensa-esp32-elf-addr2line addr2line; do + if command -v "$t" >/dev/null 2>&1; then ADDR2LINE="$t"; break; fi +done +[[ -z "$ADDR2LINE" ]] && { echo "No addr2line found" >&2; exit 4; } + +if command -v llvm-objdump >/dev/null 2>&1; then + OBJDUMP="llvm-objdump" +else + for t in riscv32-esp-elf-objdump xtensa-esp32-elf-objdump objdump; do + if command -v "$t" >/dev/null 2>&1; then OBJDUMP="$t"; break; fi + done +fi +(( ANNOTATE )) && [[ -z "${OBJDUMP:-}" ]] && { echo "No objdump found" >&2; exit 5; } + +if command -v llvm-nm >/dev/null 2>&1; then + NM="llvm-nm" +elif command -v nm >/dev/null 2>&1; then + NM="nm" +else + NM="" +fi + +if command -v c++filt >/dev/null 2>&1; then + CPPFILT="c++filt" +elif command -v llvm-cxxfilt >/dev/null 2>&1; then + CPPFILT="llvm-cxxfilt" +else + CPPFILT="" +fi + +# ---- cores (an auto-detected count leaves one core free; an explicit -j is used as given, minimum 1) ---- +if [[ -z "$JOBS" ]]; then + if command -v nproc >/dev/null 2>&1; then JOBS=$(nproc) + elif [[ "$OSTYPE" == "darwin"* ]]; then JOBS=$(sysctl -n hw.ncpu 2>/dev/null || echo 4) + else JOBS=$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4) + fi + (( JOBS = JOBS > 1 ? JOBS - 1 : 1 )) +fi +(( JOBS >= 1 )) || JOBS=1; echo ">> Using $JOBS parallel jobs" + +TMP=$(mktemp -d) +trap 'rm -rf "$TMP"' EXIT + +# ---- extract PCs (every 0x-prefixed hex token in the log, lower-cased, with occurrence counts) ---- +grep -aoE '0x[0-9a-fA-F]+' "$PCS_IN" | tr 'A-F' 'a-f' | sort | uniq -c >"$TMP/pc_counts.txt" || true +awk '{print $2}' "$TMP/pc_counts.txt" >"$TMP/addrs.txt" +[[ ! -s "$TMP/addrs.txt" ]] && { echo "No addresses found" >&2; exit 6; } + +# ---- parallel addr2line (live PC -> function to stderr) ---- +CHUNK=400 +split -l "$CHUNK" "$TMP/addrs.txt" "$TMP/chunk." 
+ +find "$TMP" -name 'chunk.*' -type f -print0 \ +| xargs -0 -I{} -n1 -P "$JOBS" bash -c ' + set -euo pipefail + ADDR2LINE="$1"; ELF="$2"; CHUNK="$3"; CPP="$4" + OUT="${CHUNK}.sym" + "$ADDR2LINE" -a -f -e "$ELF" $(cat "$CHUNK") \ + | tee "$OUT" \ + | awk '"'"'NR%3==1{a=$0;next} NR%3==2{f=$0; printf "%s\t%s\n",a,f; next} NR%3==0{next}'"'"' \ + | { if [[ -n "$CPP" ]]; then "$CPP"; else cat; fi; } 1>&2 +' _ "$ADDR2LINE" "$ELF" {} "$CPPFILT" + +# Collate the per-chunk addr2line outputs (address / function / file:line triplets) into one file +cat "$TMP"/chunk.*.sym > "$TMP/symbols.raw" + +# ---- parse 3-line addr/func/file:line into one tab-separated row per address ---- +# Normalize leading zeros in addresses so joins match grep-extracted PCs +awk 'NR%3==1{a=$0; sub(/^0x0+/, "0x", a); next} NR%3==2{f=$0; next} NR%3==0{print a "\t" f "\t" $0}' \ + "$TMP/symbols.raw" >"$TMP/map.tsv" + +# ---- counts: addr -> samplecount (sorted on the address so it can be joined below) ---- +awk '{printf "%s\t%s\n",$2,$1}' "$TMP/pc_counts.txt" | sort -k1,1 >"$TMP/counts.tsv" + +# ---- choose mapping: default (addr2line; may show inlined names) vs --no-inlines (symbol-table) ---- +DEFAULT_ADDR_FUNC="$TMP/addr_func.tsv" +cut -f1,2 "$TMP/map.tsv" | sort -k1,1 >"$DEFAULT_ADDR_FUNC" + +if [[ "$NO_INLINES" == "1" ]]; then + if [[ -z "$NM" ]]; then + echo "WARNING: nm/llvm-nm not found; falling back to inline-aware mapping." >&2 + ADDR_FUNC_FILE="$DEFAULT_ADDR_FUNC" + else + echo ">> Building symbol table for no-inlines mapping..." + # Extract text symbols (nm type T or t) as: hexaddr\tname (demangled afterwards if a demangler was found) + # Try llvm-nm format first; fall back to generic nm. + if [[ "$NM" == "llvm-nm" ]]; then + # llvm-nm -n --defined-only emits: ADDRESS TYPE NAME + "$NM" -n --defined-only "$ELF" \ + | awk '/ [Tt] /{print $1 "\t" $3}' > "$TMP/syms.raw" + else + # generic nm -n emits: ADDRESS TYPE NAME (varies a bit across platforms) + "$NM" -n --defined-only "$ELF" 2>/dev/null \ + | awk '/ [Tt] /{print $1 "\t" $3}' > "$TMP/syms.raw" || true + # If nothing was captured (presumably an nm without --defined-only, e.g. on macOS), retry without that flag; the same ADDRESS TYPE NAME columns are assumed: + if [[ ! 
-s "$TMP/syms.raw" ]]; then + "$NM" -n "$ELF" 2>/dev/null | awk '/ [Tt] /{print $1 "\t" $3}' > "$TMP/syms.raw" || true + fi + fi + + if [[ -n "$CPPFILT" && -s "$TMP/syms.raw" ]]; then + "$CPPFILT" < "$TMP/syms.raw" > "$TMP/syms.dem.raw" || cp "$TMP/syms.raw" "$TMP/syms.dem.raw" + else + cp "$TMP/syms.raw" "$TMP/syms.dem.raw" + fi + + # Normalize addresses and sort ascending (split on tabs only: demangled names contain spaces) + awk -F'\t' '{addr=$1; sub(/^0x0+/, "0x", addr); print addr "\t" $2}' "$TMP/syms.dem.raw" \ + | awk 'NF' \ + | sort -k1,1 > "$TMP/syms.tsv" + + if [[ ! -s "$TMP/syms.tsv" ]]; then + echo "WARNING: no text symbols found; falling back to inline-aware mapping." >&2 + ADDR_FUNC_FILE="$DEFAULT_ADDR_FUNC" + else + # Map each PC to the *containing* function: last symbol with addr <= PC. + # syms.tsv is sorted ascending by address, so each PC is resolved with a binary search. + awk ' + function hex2num(h, x, n,i,c) { + gsub(/^0x/,"",h); n=0 + for(i=1;i<=length(h);i++){ c=substr(h,i,1) + x = index("0123456789abcdef", tolower(c)) - 1 + if (x<0) x = index("0123456789ABCDEF", c) - 1 + n = n*16 + x + } + return n + } + BEGIN { FS="\t"; NSYM=0; NPC=0 + # preload symbols as numeric start address + name (tab-split keeps demangled names with spaces intact) + while ((getline < ARGV[1]) > 0) { + snum[NSYM]=hex2num($1); sname[NSYM]=$2; NSYM++ + } + # load PCs + while ((getline < ARGV[2]) > 0) { + pc[NPC]=$0; NPC++ + } + # counters start at 0 explicitly: an unset awk variable used as a subscript is "", not 0, + # which would hide the first symbol and the first PC from the 0-based loops below + for (i=0; i<NPC; i++) { + p=pc[i]; pn=hex2num(p); lo=0; hi=NSYM-1; si=-1 + # binary search: si = index of the last symbol whose start address is <= PC + while (lo<=hi) { + mid=int((lo+hi)/2) + if (snum[mid]<=pn) { si=mid; lo=mid+1 } else hi=mid-1 + } + if (si>=0) printf "%s\t%s\n", p, sname[si] + else printf "%s\t<unknown>\n", p + } + exit 0 + } + ' "$TMP/syms.tsv" "$TMP/addrs.txt" \ + | sort -k1,1 > "$TMP/addr_func.noinline.tsv" + + ADDR_FUNC_FILE="$TMP/addr_func.noinline.tsv" + fi + fi +else + ADDR_FUNC_FILE="$DEFAULT_ADDR_FUNC" +fi + +# ---- aggregate to hot functions (sum sample counts per function name) ---- +join -t $'\t' -a1 -e "" -o 1.2,2.2 "$TMP/counts.tsv" "$ADDR_FUNC_FILE" \ +| awk -F'\t' '{s[$2]+=$1} END{for(k in s) printf "%8d %s\n",s[k],k}' \ +| sort -nr > "$TMP/hot.txt" + +# ---- demangle final hot list (if available) ---- +if [[ -n "$CPPFILT" ]]; then + "$CPPFILT" < "$TMP/hot.txt" > hot_functions.txt +else + cp "$TMP/hot.txt" hot_functions.txt +fi + +echo "=== Top 50 hot functions ===" +head -50 hot_functions.txt +echo 
"Full list in: hot_functions.txt" + +# ---- annotated source+assembly (optional) ---- +if (( ANNOTATE )); then + echo ">> Generating annotated source+assembly..." + awk '{printf "%s %s\n",$2,$1}' "$TMP/pc_counts.txt" >"$TMP/count.map" + + if [[ "$OBJDUMP" == "llvm-objdump" ]]; then + # Portable across llvm-objdump versions + "$OBJDUMP" --source -l --demangle -d "$ELF" >"$TMP/disasm.txt" + else + "$OBJDUMP" -S -C -l -d "$ELF" >"$TMP/disasm.txt" + fi + + # Overlay hit counts onto the disassembly + awk -v counts="$TMP/count.map" ' + BEGIN { + while ((getline < counts) > 0) { + addr=$1; cnt=$2 + gsub(/^0x/,"",addr) + map[addr]=cnt + } + } + /^[[:space:]]*[0-9a-f]+:/ { + split($1,a,":"); key=a[1] + if (key in map) + printf("%-12s %6d | %s\n", $1, map[key], substr($0, index($0,$2))) + else + print $0 + next + } + { print } + ' "$TMP/disasm.txt" > annotated.S + + echo "Annotated source + assembly written to: annotated.S" + echo "Tip: less -R annotated.S" +fi diff --git a/Firmware/main/src/app_main.cpp b/Firmware/main/src/app_main.cpp index d301005..eab8620 100644 --- a/Firmware/main/src/app_main.cpp +++ b/Firmware/main/src/app_main.cpp @@ -1,17 +1,18 @@ // Cardboy firmware entry point: boot platform services and run the modular app system. 
-#include "cardboy/backend/esp_backend.hpp" #include "cardboy/apps/clock_app.hpp" #include "cardboy/apps/gameboy_app.hpp" #include "cardboy/apps/menu_app.hpp" #include "cardboy/apps/tetris_app.hpp" +#include "cardboy/backend/esp_backend.hpp" #include "cardboy/sdk/app_system.hpp" -#include "freertos/FreeRTOS.h" -#include "freertos/task.h" #include "esp_err.h" #include "esp_pm.h" #include "esp_sleep.h" +#include "esp_system.h" +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" #include "sdkconfig.h" #include @@ -19,9 +20,9 @@ #include #include #include +#include #include #include -#include #include namespace { @@ -53,11 +54,11 @@ constexpr apps::EmbeddedRomDescriptor kEmbeddedRoms[] = { #if CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS && CONFIG_FREERTOS_USE_TRACE_FACILITY namespace { -constexpr TickType_t kStatsTaskDelayTicks = pdMS_TO_TICKS(5000); -constexpr TickType_t kStatsWarmupDelay = pdMS_TO_TICKS(2000); -constexpr UBaseType_t kStatsTaskPriority = tskIDLE_PRIORITY + 1; -constexpr uint32_t kStatsTaskStack = 4096; -constexpr char kStatsTaskName[] = "TaskStats"; +constexpr TickType_t kStatsTaskDelayTicks = pdMS_TO_TICKS(5000); +constexpr TickType_t kStatsWarmupDelay = pdMS_TO_TICKS(2000); +constexpr UBaseType_t kStatsTaskPriority = tskIDLE_PRIORITY + 1; +constexpr uint32_t kStatsTaskStack = 4096; +constexpr char kStatsTaskName[] = "TaskStats"; struct TaskRuntimeSample { TaskHandle_t handle; @@ -65,11 +66,11 @@ struct TaskRuntimeSample { }; struct TaskUsageRow { - std::string name; - uint64_t delta; - UBaseType_t priority; - uint32_t stackHighWaterBytes; - bool isIdle; + std::string name; + uint64_t delta; + UBaseType_t priority; + uint32_t stackHighWaterBytes; + bool isIdle; }; [[nodiscard]] uint64_t deltaWithWrap(uint32_t current, uint32_t previous) { @@ -79,7 +80,7 @@ struct TaskUsageRow { } void task_usage_monitor(void*) { - static constexpr char tag[] = "TaskUsage"; + static constexpr char tag[] = "TaskUsage"; std::vector lastSamples; uint32_t lastTotal = 
0; @@ -94,7 +95,7 @@ void task_usage_monitor(void*) { std::vector statusBuffer(taskCount); uint32_t totalRuntime = 0; - const UBaseType_t captured = uxTaskGetSystemState(statusBuffer.data(), statusBuffer.size(), &totalRuntime); + const UBaseType_t captured = uxTaskGetSystemState(statusBuffer.data(), statusBuffer.size(), &totalRuntime); if (captured == 0) continue; statusBuffer.resize(captured); @@ -118,8 +119,8 @@ void task_usage_monitor(void*) { std::vector rows; rows.reserve(statusBuffer.size()); - uint64_t idleDelta = 0; - uint64_t activeDelta = 0; + uint64_t idleDelta = 0; + uint64_t activeDelta = 0; uint64_t accountedDelta = 0; for (const auto& status: statusBuffer) { @@ -128,18 +129,18 @@ void task_usage_monitor(void*) { }); const uint32_t previousRuntime = (it != lastSamples.end()) ? it->runtime : status.ulRunTimeCounter; - const uint64_t taskDelta = (it != lastSamples.end()) ? deltaWithWrap(status.ulRunTimeCounter, previousRuntime) : 0ULL; + const uint64_t taskDelta = + (it != lastSamples.end()) ? deltaWithWrap(status.ulRunTimeCounter, previousRuntime) : 0ULL; currentSamples.push_back({status.xHandle, status.ulRunTimeCounter}); - TaskUsageRow row{ - .name = std::string(status.pcTaskName ? status.pcTaskName : ""), - .delta = taskDelta, - .priority = status.uxCurrentPriority, - .stackHighWaterBytes = static_cast(status.usStackHighWaterMark) * sizeof(StackType_t), - .isIdle = status.uxCurrentPriority == tskIDLE_PRIORITY || - (status.pcTaskName && std::strncmp(status.pcTaskName, "IDLE", 4) == 0) - }; + TaskUsageRow row{.name = std::string(status.pcTaskName ? 
status.pcTaskName : ""), + .delta = taskDelta, + .priority = status.uxCurrentPriority, + .stackHighWaterBytes = + static_cast(status.usStackHighWaterMark) * sizeof(StackType_t), + .isIdle = status.uxCurrentPriority == tskIDLE_PRIORITY || + (status.pcTaskName && std::strncmp(status.pcTaskName, "IDLE", 4) == 0)}; rows.push_back(std::move(row)); @@ -156,9 +157,8 @@ void task_usage_monitor(void*) { if (rows.empty()) continue; - std::sort(rows.begin(), rows.end(), [](const TaskUsageRow& a, const TaskUsageRow& b) { - return a.delta > b.delta; - }); + std::sort(rows.begin(), rows.end(), + [](const TaskUsageRow& a, const TaskUsageRow& b) { return a.delta > b.delta; }); const double windowMs = static_cast(totalDelta) / 1000.0; @@ -181,14 +181,20 @@ void task_usage_monitor(void*) { std::printf(" %-16s %6.2f%% (ISRs / scheduler)\n", "", residualPct); } - std::printf("[%s] Active %.2f%% | Idle %.2f%%\n", tag, - (activeDelta * 100.0) / static_cast(totalDelta), idlePct); + std::printf("[%s] Active %.2f%% | Idle %.2f%%\n", tag, (activeDelta * 100.0) / static_cast(totalDelta), + idlePct); + + const uint32_t heapFree = esp_get_free_heap_size(); + const uint32_t heapMinimum = esp_get_minimum_free_heap_size(); + std::printf("[%s] Heap free %lu B | Min free %lu B\n", tag, static_cast(heapFree), + static_cast(heapMinimum)); std::fflush(stdout); } } void start_task_usage_monitor() { - xTaskCreatePinnedToCore(task_usage_monitor, kStatsTaskName, kStatsTaskStack, nullptr, kStatsTaskPriority, nullptr, 0); + xTaskCreatePinnedToCore(task_usage_monitor, kStatsTaskName, kStatsTaskStack, nullptr, kStatsTaskPriority, nullptr, + 0); } } // namespace