incorporated s3 sync tool from external github repo

Nicholas Wehr
2025-08-28 11:27:31 -07:00
parent e9dcff4015
commit bf51dc83e5
2 changed files with 215 additions and 0 deletions


@@ -0,0 +1,57 @@
# 🚀 S3Sync Runner
The fastest way to pull down EVM block files from S3.
This script automates syncing **massive S3 object stores** in a **safe, resumable, and time-tracked way**. The traditional `aws s3 sync` is just way too slow.
## Features
- ✅ Auto-installs [nidor1998/s3sync](https://github.com/nidor1998/s3sync) (latest release) into `~/.local/bin`
- ✅ Sequential per-prefix syncs (e.g., `21000000/`, `22000000/`, …)
- ✅ Per-prefix timing: `22000000 took 12 minutes!`
- ✅ Total runtime summary at the end
- ✅ Designed for **tiny files at scale** (EVM block archives)
- ✅ Zero-config bootstrap — just run the script
## Quick Start
```bash
chmod +x s3sync-runner.sh
./s3sync-runner.sh
```
> Skip ahead to a specific block range:
```bash
./s3sync-runner.sh --start-at 30000000
```
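With `--start-at`, every prefix listed before the one you pass stays skipped; the script prints a running status table after each finished prefix, so a resumed run looks roughly like this (illustrative values):
```bash
---- Status ----
21000000 -- SKIPPED
22000000 -- SKIPPED
30000000 13 minutes
31000000 -- TODO
----------------
```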
The script will:
* Install or update s3sync into ~/.local/bin
* Discover top-level prefixes in your S3 bucket (see the sketch below)
* Sync them one at a time, printing elapsed minutes
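Prefix discovery is just a filtered `aws s3 ls` against the bucket root; this is roughly what the script runs, with the bucket and region taken from the default config below:
```bash
# top-level "directories" show up as "PRE 21000000/" lines; keep only the numeric ones
aws s3 ls "s3://hl-testnet-evm-blocks/" --region ap-northeast-1 --request-payer requester \
  | awk '/^ *PRE /{print $2}' | sed 's:/$::' | grep -E '^[0-9]+$'
```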
## Configuration
Edit the top of s3sync-runner.sh if needed:
```bash
BUCKET="hl-testnet-evm-blocks" # could be hl-mainnet-evm-blocks
REGION="ap-northeast-1" # hardcoded bucket region
DEST="$HOME/evm-blocks-testnet" # local target directory (this is what nanoreth will look at)
WORKERS=512 # worker threads per sync (lotsa workers need lotsa RAM)
```
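The script doesn't manage credentials itself; both the AWS CLI and s3sync are expected to pick them up from the usual places (environment variables, `~/.aws/credentials`, or an instance role). Before kicking off a multi-hour sync, a quick sanity check helps, and since the bucket is requester-pays, listing it doubles as a permissions check:
```bash
# confirm credentials resolve at all
aws sts get-caller-identity
# confirm you can list the requester-pays bucket in the configured region
aws s3 ls "s3://hl-testnet-evm-blocks/" --region ap-northeast-1 --request-payer requester | head
```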
## Example Output
```bash
[2025-08-20 20:01:02] START 21000000
[2025-08-20 20:13:15] 21000000 took 12 minutes!
[2025-08-20 20:13:15] START 22000000
[2025-08-20 20:26:40] 22000000 took 13 minutes!
[2025-08-20 20:26:40] ALL DONE in 25 minutes.
```
## Hackathon Context
This runner was built as part of the Hyperliquid DEX Hackathon to accelerate:
* ⛓️ Blockchain archive node ingestion
* 📂 EVM block dataset replication
* 🧩 DEX ecosystem data pipelines


@@ -0,0 +1,158 @@
#!/usr/bin/env bash
# @author Niko Wehr (wwwehr)
set -euo pipefail
# ---- config ----
BUCKET="hl-testnet-evm-blocks"
REGION="ap-northeast-1"
DEST="${HOME}/evm-blocks-testnet"
WORKERS=512
S3SYNC="${HOME}/.local/bin/s3sync"
START_AT="" # default: run all
# ----------------
# parse args
while [[ $# -gt 0 ]]; do
    case "$1" in
        --start-at)
            START_AT="$2"
            shift 2
            ;;
        *)
            echo "Unknown arg: $1" >&2
            exit 1
            ;;
    esac
done
now(){ date +"%F %T"; }
log(){ printf '[%s] %s\n' "$(now)" "$*"; }
die(){ log "ERROR: $*"; exit 1; }
trap 'log "Signal received, exiting."; exit 2' INT TERM
need(){ command -v "$1" >/dev/null 2>&1 || die "missing dependency: $1"; }
install_s3sync_latest() {
    need curl
    GHAPI="https://api.github.com/repos/nidor1998/s3sync/releases/latest"
    os="$(uname | tr '[:upper:]' '[:lower:]')"
    arch_raw="$(uname -m)"
    case "$arch_raw" in
        x86_64|amd64) arch_tag="x86_64" ;;
        aarch64|arm64) arch_tag="aarch64" ;;
        *) die "unsupported arch: ${arch_raw}" ;;
    esac
    # Map OS → asset prefix
    case "$os" in
        linux) prefix="s3sync-linux-glibc2.28-${arch_tag}" ;;
        darwin) prefix="s3sync-macos-${arch_tag}" ;;
        msys*|mingw*|cygwin*|windows) prefix="s3sync-windows-${arch_tag}" ;;
        *) die "unsupported OS: ${os}" ;;
    esac
    # Fetch latest release JSON (unauthenticated)
    json="$(curl -fsSL "$GHAPI")" || die "failed to query GitHub API"
    # Pick URLs for tarball and checksum ('|| true' so the explicit die below fires when nothing matches)
    tar_url="$(printf '%s' "$json" | awk -F'"' '/browser_download_url/ {print $4}' | grep -F "${prefix}.tar.gz" | head -n1 || true)"
    sum_url="$(printf '%s' "$json" | awk -F'"' '/browser_download_url/ {print $4}' | grep -F "${prefix}.sha256" | head -n1 || true)"
    [[ -n "$tar_url" ]] || die "could not find asset for prefix: ${prefix}"
    [[ -n "$sum_url" ]] || die "could not find checksum for prefix: ${prefix}"
    mkdir -p "${HOME}/.local/bin"
    tmpdir="$(mktemp -d)"; trap 'rm -rf "$tmpdir"' EXIT
    tar_path="${tmpdir}/s3sync.tar.gz"
    sum_path="${tmpdir}/s3sync.sha256"
    log "Downloading: $tar_url"
    curl -fL --retry 5 --retry-delay 1 -o "$tar_path" "$tar_url"
    curl -fL --retry 5 --retry-delay 1 -o "$sum_path" "$sum_url"
    # Verify checksum: the .sha256 asset may be "sha256:<hash>" or "<hash>  <file>"; handle both
    want_sum="$(sed -n 's/^sha256:\(.*\)$/\1/p' "$sum_path" | tr -d '[:space:]' || true)"
    [[ -n "$want_sum" ]] || want_sum="$(awk 'NR==1 {print $1}' "$sum_path" || true)"
    [[ -n "$want_sum" ]] || die "could not parse checksum file"
    # macOS ships shasum rather than sha256sum
    if command -v sha256sum >/dev/null 2>&1; then
        got_sum="$(sha256sum "$tar_path" | awk '{print $1}')"
    else
        got_sum="$(shasum -a 256 "$tar_path" | awk '{print $1}')"
    fi
    [[ "$want_sum" == "$got_sum" ]] || die "sha256 mismatch: want $want_sum got $got_sum"
    # Extract and install
    tar -xzf "$tar_path" -C "$tmpdir"
    binpath="$(find "$tmpdir" -maxdepth 2 -type f -name 's3sync' | head -n1 || true)"
    # check for presence, then restore the exec bit in case the archive dropped it
    [[ -n "$binpath" && -f "$binpath" ]] || die "s3sync binary not found in archive"
    chmod +x "$binpath"
    mv -f "$binpath" "$S3SYNC"
    log "s3sync installed at $S3SYNC"
}
# --- deps & install/update ---
need aws
install_s3sync_latest
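# make sure ~/.local/bin is on PATH for this run (the absolute $S3SYNC path is invoked below regardless)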
[[ ":$PATH:" == *":$HOME/.local/bin:"* ]] || export PATH="$HOME/.local/bin:$PATH"
mkdir -p "$DEST"
# list prefixes
log "Listing top-level prefixes in s3://${BUCKET}/"
mapfile -t PREFIXES < <(
    aws s3 ls "s3://${BUCKET}/" --region "$REGION" --request-payer requester \
        | awk '/^ *PRE /{print $2}' | sed 's:/$::' | grep -E '^[0-9]+$' || true
)
((${#PREFIXES[@]})) || die "No prefixes found."
# mark initial status
declare -A RESULTS
if [[ -z "$START_AT" ]]; then
    skipping=0
else
    skipping=1
fi
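# walk prefixes in listing order: everything before START_AT stays SKIPPED, the rest is marked TODO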
for p in "${PREFIXES[@]}"; do
    if [[ -n "$START_AT" && "$p" == "$START_AT" ]]; then
        skipping=0
    fi
    if (( skipping )); then
        RESULTS["$p"]="-- SKIPPED"
    else
        RESULTS["$p"]="-- TODO"
    fi
done
total_start=$(date +%s)
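# sync the non-skipped prefixes one at a time, timing each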
for p in "${PREFIXES[@]}"; do
    if [[ "${RESULTS[$p]}" == "-- SKIPPED" ]]; then
        continue
    fi
    src="s3://${BUCKET}/${p}/"
    dst="${DEST}/${p}/"
    mkdir -p "$dst"
    log "START ${p}"
    start=$(date +%s)
    # mirror this prefix: requester-pays reads, WORKERS parallel workers
    "$S3SYNC" \
        --source-request-payer \
        --source-region "$REGION" \
        --worker-size "$WORKERS" \
        --max-parallel-uploads "$WORKERS" \
        "$src" "$dst"
    end=$(date +%s)
    # round elapsed seconds up to whole minutes
    mins=$(( (end - start + 59) / 60 ))
    log "${p} took ${mins} minutes!"
    RESULTS["$p"]="$mins minutes"
    # Print status table so far
    echo "---- Status ----"
    for k in "${PREFIXES[@]}"; do
        echo "$k ${RESULTS[$k]}"
    done
    echo "----------------"
done
total_end=$(date +%s)
total_mins=$(( (total_end - total_start + 59) / 60 ))
echo "ALL DONE in $total_mins minutes."