#!/bin/sh
# dockerd-rootless.sh executes dockerd in rootless mode.
#
# Usage: dockerd-rootless.sh [DOCKERD_OPTIONS]
#
# External dependencies:
# * newuidmap and newgidmap needs to be installed.
# * /etc/subuid and /etc/subgid needs to be configured for the current user.
#
# Recognized environment variables:
# * DOCKERD_ROOTLESS_ROOTLESSKIT_STATE_DIR=DIR: the rootlesskit state dir.
#   * Defaults to "$XDG_RUNTIME_DIR/dockerd-rootless".
# * DOCKERD_ROOTLESS_ROOTLESSKIT_NET=(slirp4netns|vpnkit|pasta|gvisor-tap-vsock|lxc-user-nic): the rootlesskit network driver.
#   * Defaults to "slirp4netns" if slirp4netns (>= v0.4.0) is installed, else "pasta", else "vpnkit", else "gvisor-tap-vsock".
# * DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=NUM: the MTU value for the rootlesskit network driver.
#   * Defaults to 65520 for slirp4netns, pasta, and gvisor-tap-vsock. Defaults to 1500 for other rootlesskit network drivers.
# * DOCKERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER=(builtin|slirp4netns|implicit|gvisor-tap-vsock): the rootlesskit port driver.
#   * Defaults to "implicit" for "pasta", "builtin" for other rootlesskit network drivers.
# * DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX=(auto|true|false): whether to protect slirp4netns with a dedicated mount namespace.
#   * Defaults to "auto".
# * DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP=(auto|true|false): whether to protect slirp4netns with seccomp.
#   * Defaults to "auto".
# * DOCKERD_ROOTLESS_ROOTLESSKIT_DISABLE_HOST_LOOPBACK=(true|false): prohibit connections to 127.0.0.1 on the host (including via 10.0.2.2, in the case of slirp4netns).
#   * Defaults to "true".
# * DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS=(true|false): whether to launch rootlesskit with the "detach-netns" mode.
#   The "detached-netns" mode accelerates `docker (pull|push|build)` and enables `docker run --net=host`
#   Defaults to "true". Set this to false only when facing a compatibility issue.

# To apply an environment variable via systemd, create ~/.config/systemd/user/docker.service.d/override.conf as follows,
# and run `systemctl --user daemon-reload && systemctl --user restart docker`:
# --- BEGIN ---
# [Service]
# Environment="DOCKERD_ROOTLESS_ROOTLESSKIT_NET=pasta"
# Environment="DOCKERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER=implicit"
# ---  END  ---

# Guide to choose the network driver and the port driver:
#
#  Network driver   | Port driver      | Net throughput | Port throughput | Src IP | No SUID | Note
#  -----------------|------------------|----------------|-----------------|--------|---------|---------------------------------------------------------
#  gvisor-tap-vsock | builtin          | Slow           | Fast ✅         | ✅ (*) | ✅      | Default when slirp4netns is not installed
#  slirp4netns      | builtin          | Slow           | Fast ✅         | ✅ (*) | ✅      | Default when slirp4netns is installed
#  vpnkit           | builtin          | Slow           | Fast ✅         | ✅ (*) | ✅      | Legacy
#  gvisor-tap-vsock | gvisor-tap-vsock | Slow           | Slow            | ❌     | ✅      | Not recommended. Use `builtin` port driver instead.
#  slirp4netns      | slirp4netns      | Slow           | Slow            | ✅     | ✅      |
#  pasta            | implicit         | Slow           | Fast ✅         | ✅     | ✅      | Experimental; Needs recent version of pasta (2023_12_04)
#  lxc-user-nic     | builtin          | Fast ✅        | Fast ✅         | ✅ (*) | ❌      | Experimental
#  (bypass4netns)   | (bypass4netns)   | Fast ✅        | Fast ✅         | ✅     | ✅      | (Not integrated to RootlessKit)
#
# (*) Applicable since RootlessKit v3.0. Also requires userland-proxy to be disabled.

# See the documentation for the further information: https://docs.docker.com/go/rootless/

set -e -x
case "$1" in
	"check" | "install" | "uninstall")
		echo "Did you mean 'dockerd-rootless-setuptool.sh $@' ?"
		exit 1
		;;
esac
if ! [ -w "$XDG_RUNTIME_DIR" ]; then
	echo "XDG_RUNTIME_DIR needs to be set and writable"
	exit 1
fi
if ! [ -d "$HOME" ]; then
	echo "HOME needs to be set and exist."
	exit 1
fi

mount_directory() {
	if [ -z "$_DOCKERD_ROOTLESS_CHILD" ]; then
		echo "mount_directory should be called from the child context. Otherwise data loss is at risk" >&2
		exit 1
	fi

	DIRECTORY="$1"
	if [ ! -d "$DIRECTORY" ]; then
		return
	fi

	# Bind mount directory: this makes this directory visible to
	# Dockerd, even if it is originally a symlink, given Dockerd does
	# not always follow symlinks. Some directories might also be
	# "copied-up", meaning that they will also be writable on the child
	# namespace; this will be the case only if they are provided as
	# --copy-up to the rootlesskit.
	DIRECTORY_REALPATH=$(realpath "$DIRECTORY")
	MOUNT_OPTIONS="${2:---bind}"
	rm -rf "$DIRECTORY"
	mkdir -p "$DIRECTORY"
	mount $MOUNT_OPTIONS "$DIRECTORY_REALPATH" "$DIRECTORY"
}

rootlesskit=""
for f in docker-rootlesskit rootlesskit; do
	if command -v $f > /dev/null 2>&1; then
		rootlesskit=$f
		break
	fi
done
if [ -z "$rootlesskit" ]; then
	echo "rootlesskit needs to be installed"
	exit 1
fi

: "${CONTAINERD_ROOTLESS_ROOTLESSKIT_STATE_DIR:=$XDG_RUNTIME_DIR/containerd-rootless}"
if [ -e "$CONTAINERD_ROOTLESS_ROOTLESSKIT_STATE_DIR" ]; then
	# https://github.com/moby/moby/issues/52171
	echo "dockerd-rootless.sh conflicts with containerd-rootless.sh. Stop containerd-rootless.sh if it's running, and remove $CONTAINERD_ROOTLESS_ROOTLESSKIT_STATE_DIR if it still exists."
	exit 1
fi

: "${DOCKERD_ROOTLESS_ROOTLESSKIT_STATE_DIR:=$XDG_RUNTIME_DIR/dockerd-rootless}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_NET:=}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_MTU:=}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER:=}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX:=auto}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP:=auto}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_DISABLE_HOST_LOOPBACK:=}"
: "${DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS:=true}"
net=$DOCKERD_ROOTLESS_ROOTLESSKIT_NET
port_driver=$DOCKERD_ROOTLESS_ROOTLESSKIT_PORT_DRIVER
mtu=$DOCKERD_ROOTLESS_ROOTLESSKIT_MTU
if [ -z "$net" ]; then
	if command -v slirp4netns > /dev/null 2>&1; then
		# If --netns-type is present in --help, slirp4netns is >= v0.4.0.
		if slirp4netns --help | grep -qw -- --netns-type; then
			net=slirp4netns
		else
			echo "slirp4netns found but seems older than v0.4.0. Checking for other network drivers."
		fi
	fi
	if [ -z "$net" ]; then
		if command -v pasta > /dev/null 2>&1; then
			net=pasta
		fi
	fi
	if [ -z "$net" ]; then
		if command -v vpnkit > /dev/null 2>&1; then
			net=vpnkit
		fi
	fi
	if [ -z "$net" ]; then
		net=gvisor-tap-vsock
	fi
fi
if [ "$net" = host ]; then
	echo "Unsupported RootlessKit network driver: $net"
	exit 1
fi
if [ -z "$mtu" ]; then
	case "$net" in
		slirp4netns | pasta | gvisor-tap-vsock)
			mtu=65520
			;;
		*)
			mtu=1500
			;;
	esac
fi
if [ -z "$port_driver" ]; then
	if [ "$net" = pasta ]; then
		port_driver=implicit
	else
		port_driver=builtin
	fi
fi

host_loopback="--disable-host-loopback"
if [ "$DOCKERD_ROOTLESS_ROOTLESSKIT_DISABLE_HOST_LOOPBACK" = "false" ]; then
	host_loopback=""
fi

dockerd="${DOCKERD:-dockerd}"

if [ -z "$_DOCKERD_ROOTLESS_CHILD" ]; then
	_DOCKERD_ROOTLESS_CHILD=1
	export _DOCKERD_ROOTLESS_CHILD
	if [ "$(id -u)" = "0" ]; then
		echo "This script must be executed as a non-privileged user"
		exit 1
	fi
	# `selinuxenabled` always returns false in RootlessKit child, so we execute `selinuxenabled` in the parent.
	# https://github.com/rootless-containers/rootlesskit/issues/94
	if command -v selinuxenabled > /dev/null 2>&1 && selinuxenabled; then
		_DOCKERD_ROOTLESS_SELINUX=1
		export _DOCKERD_ROOTLESS_SELINUX
	fi

	case "$DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS" in
		1 | true)
			DOCKERD_ROOTLESS_ROOTLESSKIT_FLAGS="--detach-netns $DOCKERD_ROOTLESS_ROOTLESSKIT_FLAGS"
			;;
		0 | false)
			# NOP
			;;
		*)
			echo "Unknown DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS value: $DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS"
			exit 1
			;;
	esac

	# Re-exec the script via RootlessKit, so as to create unprivileged {user,mount,network} namespaces.
	#
	# --copy-up allows removing/creating files in the directories by creating tmpfs and symlinks
	# * /etc: copy-up is required so as to prevent `/etc/resolv.conf` in the
	#         namespace from being unexpectedly unmounted when `/etc/resolv.conf` is recreated on the host
	#         (by either systemd-networkd or NetworkManager)
	# * /run: copy-up is required so that we can create /run/docker (hardcoded for plugins) in our namespace
	exec $rootlesskit \
		--state-dir=$DOCKERD_ROOTLESS_ROOTLESSKIT_STATE_DIR \
		--net=$net --mtu=$mtu \
		--slirp4netns-sandbox=$DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SANDBOX \
		--slirp4netns-seccomp=$DOCKERD_ROOTLESS_ROOTLESSKIT_SLIRP4NETNS_SECCOMP \
		$host_loopback --port-driver=$port_driver \
		--copy-up=/etc --copy-up=/run \
		--propagation=rslave \
		$DOCKERD_ROOTLESS_ROOTLESSKIT_FLAGS \
		"$0" "$@"
else
	[ "$_DOCKERD_ROOTLESS_CHILD" = 1 ]

	# remove the symlinks for the existing files in the parent namespace if any,
	# so that we can create our own files in our mount namespace.
	rm -f /run/docker /run/containerd /run/xtables.lock

	if [ -n "$_DOCKERD_ROOTLESS_SELINUX" ]; then
		# iptables requires /run in the child to be relabeled. The actual /run in the parent is unaffected.
		# https://github.com/containers/podman/blob/e6fc34b71aa9d876b1218efe90e14f8b912b0603/libpod/networking_linux.go#L396-L401
		# https://github.com/moby/moby/issues/41230
		chcon system_u:object_r:iptables_var_run_t:s0 /run
	fi

	if [ "$(stat -c %T -f /etc)" = "tmpfs" ] && [ -L "/etc/ssl" ]; then
		# Workaround for "x509: certificate signed by unknown authority" on openSUSE Tumbleweed.
		# https://github.com/rootless-containers/rootlesskit/issues/225
		mount_directory /etc/ssl "--rbind"
	fi

	netns="/proc/self/ns/net"
	case "$DOCKERD_ROOTLESS_ROOTLESSKIT_DETACH_NETNS" in
		1 | true)
			netns="$ROOTLESSKIT_STATE_DIR/netns"
			;;
	esac
	# When running with --firewall-backend=nftables, IP forwarding needs to be enabled
	# because the daemon won't enable it. IP forwarding is harmless in the rootless
	# netns, there's only a single external interface and only Docker uses the netns.
	# So, always enable IPv4 and IPv6 forwarding. But ignore failure to enable IPv6
	# forwarding, for hosts with IPv6 disabled.
	nsenter -n"$netns" sysctl -w net.ipv4.ip_forward=1
	nsenter -n"$netns" sysctl -w net.ipv6.conf.all.forwarding=1 || true

	exec "$dockerd" "$@"
fi
