diff --git a/.gitignore b/.gitignore index a710ac0eed..d34fab5e92 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ results/ dependencies/jdk-8u221-linux-x64.tar.gz dependencies/cmake-3.15.2/ dependencies/jdk1.8.0_221/ +dependencies/sysint_install/ kernel/kbuild/* *.o tpcc-sqlite/database/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..3857e9a4e6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "dependencies/syscall_intercept"] + path = dependencies/syscall_intercept + url = https://github.com/pmem/syscall_intercept.git diff --git a/dependencies/splitfs_deps.sh b/dependencies/splitfs_deps.sh index 4879c6cf28..8ecbc1149d 100755 --- a/dependencies/splitfs_deps.sh +++ b/dependencies/splitfs_deps.sh @@ -1,4 +1,27 @@ #!/bin/bash # install boost -sudo apt-get install libboost-dev +sudo apt-get install -y libboost-dev libcapstone-dev cmake pandoc clang + +# install syscall_intercept +git submodule init +git submodule update +syscall_dir=$PWD/syscall_intercept +install_dir=$PWD/sysint_install + +mkdir -p "$install_dir" +cd "$install_dir" || exit 1 +cmake "$syscall_dir" -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang +if [ "$?" -ne 0 ]; then + echo "Failed to configure syscall_intercept" + exit 1 +fi + +make +if [ "$?" -ne 0 ]; then + echo "Failed to build syscall_intercept" + exit 1 +fi + +sudo make install +rm -rf "$install_dir" diff --git a/dependencies/syscall_intercept b/dependencies/syscall_intercept new file mode 160000 index 0000000000..304404581c --- /dev/null +++ b/dependencies/syscall_intercept @@ -0,0 +1 @@ +Subproject commit 304404581c57d43478438d175099d20260bae74e diff --git a/splitfs_syscall_intercept/.gitignore b/splitfs_syscall_intercept/.gitignore new file mode 100755 index 0000000000..69d73c391e --- /dev/null +++ b/splitfs_syscall_intercept/.gitignore @@ -0,0 +1,32 @@ +# Object files +*.o +*.ko +*.obj +*.elf + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +#*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Others +*~ diff --git a/splitfs_syscall_intercept/README.md b/splitfs_syscall_intercept/README.md new file mode 100644 index 0000000000..9387ca40bc --- /dev/null +++ b/splitfs_syscall_intercept/README.md @@ -0,0 +1,21 @@ +# SplitFS implementation using the syscall_intercept library +[syscall_intercept](https://github.com/pmem/syscall_intercept) is a library for intercepting system calls. It works by rewriting the machine code in the text segment of the loaded program. +The prior implementation intercepted calls at the `libc` level instead. +This implementation does not make any changes to the logic of SplitFS itself. +### Currently supported applications +1. PJD Test Suite (tests run successfully) +### How to use? +All paths (not starting with `/`) are relative to the root of the repository. +1. `cd splitfs_syscall_intercept/src` +2. `make clean && make` +3. A file called `libnvp.so` will be created in `splitfs_syscall_intercept/src` -- this is the library that needs to be loaded with `LD_PRELOAD`. +4. Run the application that you want with `LD_PRELOAD=splitfs_syscall_intercept/src/libnvp.so` +### How to run the PJD test suite? +All paths (not starting with `/`) are relative to the root of the repository. +1. Set up the pmem mount at `/mnt/pmem_emul` +2. Make sure the mount point has `write` and `execute` permissions for all users so that all users can delete files. (This is required since some tests use `setuid` to switch to a different user. 
This user will then not have permission to delete the staging files during exit cleanup) +3. `cd tests` +4. `make all_sysint` diff --git a/splitfs_syscall_intercept/include/debug.h b/splitfs_syscall_intercept/include/debug.h new file mode 100644 index 0000000000..646aef755e --- /dev/null +++ b/splitfs_syscall_intercept/include/debug.h @@ -0,0 +1,122 @@ +#ifndef __DEBUG_INCLUDED +#define __DEBUG_INCLUDED + +#include "boost/preprocessor/list/for_each.hpp" + +// Turns on debugging messages +#ifndef SHOW_DEBUG +#define SHOW_DEBUG 0 +#endif + +#ifndef PRINT_DEBUG_FILE +#define PRINT_DEBUG_FILE 0 +#endif + +#ifndef SPIN_ON_ERROR +#define SPIN_ON_ERROR 0 +#endif + +//#define ENV_GDB_VEC "NVP_GDB_VEC" +/* +#define fopen fopen_orig +#undef fopen +*/ +#include +#include +#include +#include + +extern FILE* _nvp_print_fd; + +// use stderr, until we dup it +#define NVP_PRINT_FD ((_nvp_print_fd)?_nvp_print_fd:stderr) + +typedef char* charptr; +void xil_printf(FILE* f, const charptr c, ...); +//static inline void _nvp_debug_handoff(void) + +/* +#define _nvp_debug_handoff(x) \ +{ \ + xil_printf(stderr, "Stopping thread and waiting for gdb...\ngdb --pid=%i\n", getpid()); \ + fflush(stderr); \ + sleep(1); \ + volatile int asdf = 1; \ + sleep(1); \ + while(asdf) {}; \ +} +*/ + +#define _nvp_debug_handoff(x) \ +{ \ + sleep(1); \ + volatile int asdf = 1; \ + sleep(1); \ + while(asdf) {}; \ +} + + +//void outbyte(char c); + +//#define ERROR_NAMES (EPERM) (ENOENT) (ESRCH) (EINTR) (EIO) (ENXIO) (E2BIG) (ENOEXEC) (EBADF) (ECHILD) (EAGAIN) (ENOMEM) (EACCES) (EFAULT) (ENOTBLK) (EBUSY) (EEXIST) (EXDEV) (ENODEV) (ENOTDIR) (EISDIR) (EINVAL) (ENFILE) (EMFILE) (ENOTTY) (ETXTBSY) (EFBIG) (ENOSPC) (ESPIPE) (EROFS) (EMLINK) (EPIPE) (EDOM) (ERANGE) (EDEADLK) +#define ERROR_NAMES_LIST (EPERM, (ENOENT, (ESRCH, (EINTR, (EIO, (ENXIO, (E2BIG, (ENOEXEC, (EBADF, (ECHILD, (EAGAIN, (ENOMEM, (EACCES, (EFAULT, (ENOTBLK, (EBUSY, (EEXIST, (EXDEV, (ENODEV, (ENOTDIR, (EISDIR, (EINVAL, (ENFILE, (EMFILE, (ENOTTY, (ETXTBSY, (EFBIG, (ENOSPC, (ESPIPE, (EROFS, (EMLINK, (EPIPE, (EDOM, (ERANGE, (EDEADLK, BOOST_PP_NIL))))))))))))))))))))))))))))))))))) + +#define ERROR_IF_PRINT(r, data, elem) if(data == elem) { DEBUG("errno == %s (%i): %s\n", MK_STR(elem), elem, strerror(elem)); } + +// also used in fileops_wrap +//#define PRINTFUNC fprintf +#define PRINTFUNC xil_printf + + +#if DISABLE_MSG + #define MSG(format, ...) do{}while(0) +#else + #define MSG(format, ...) do{PRINTFUNC(NVP_PRINT_FD, "MSG: "); PRINTFUNC (NVP_PRINT_FD, format, ##__VA_ARGS__); fflush(NVP_PRINT_FD); }while(0) +#endif +#define LOG(format, ...) do{PRINTFUNC(NVP_PRINT_FD, "MSG: "); PRINTFUNC (NVP_PRINT_FD, format, ##__VA_ARGS__); fflush(NVP_PRINT_FD); }while(0) +#define ERROR(format, ...) do{PRINTFUNC(NVP_PRINT_FD, "\033[01;33mNVP_ERROR\e[m (pid %i): " format, getpid(), ##__VA_ARGS__); PRINTFUNC(NVP_PRINT_FD, "ROHAN HERE\n"); if(SPIN_ON_ERROR){ _nvp_debug_handoff(); } }while(0) + +extern FILE *debug_fd; +#define DEBUG_FD debug_fd + +#if PRINT_DEBUG_FILE +#define DEBUG_FILE(format, ...) do {PRINTFUNC(DEBUG_FD, "\033[01;33mNVP_DEBUG\ +\e[m (pid %i): " format, getpid(), ##__VA_ARGS__); }while(0) +#else +#define DEBUG_FILE(format, ...) do{}while(0) +#endif + +#if SHOW_DEBUG + #define DEBUG(format, ...) do{char loc; PRINTFUNC(NVP_PRINT_FD, "NVP_DEBUG (PID %i SP %p): " format, getpid(), &loc, ##__VA_ARGS__); fflush(NVP_PRINT_FD); } while(0) + #define WARNING(format, ...) 
do{PRINTFUNC(NVP_PRINT_FD, "NVP_WARNING (PID %i): " format, getpid(), ##__VA_ARGS__); } while(0) + #define DEBUG_P(format, ...) do{PRINTFUNC(NVP_PRINT_FD, format, ##__VA_ARGS__); } while(0) +#else + #define DEBUG(format, ...) do{}while(0) + #define WARNING(format, ...) do{}while(0) + #define DEBUG_P(format, ...) do{}while(0) +#endif + +#define FAIL \ +"FFFFFFFFFFFFFFFFFFFFFF AAA IIIIIIIIII LLLLLLLLLLL \n"\ +"F::::::::::::::::::::F A:::A I::::::::I L:::::::::L \n"\ +"F::::::::::::::::::::F A:::::A I::::::::I L:::::::::L \n"\ +"FF::::::FFFFFFFFF::::F A:::::::A II::::::II LL:::::::LL \n"\ +" F:::::F FFFFFF A:::::::::A I::::I L:::::L \n"\ +" F:::::F A:::::A:::::A I::::I L:::::L \n"\ +" F::::::FFFFFFFFFF A:::::A A:::::A I::::I L:::::L \n"\ +" F:::::::::::::::F A:::::A A:::::A I::::I L:::::L \n"\ +" F:::::::::::::::F A:::::A A:::::A I::::I L:::::L \n"\ +" F::::::FFFFFFFFFFA:::::AAAAAAAAA:::::A I::::I L:::::L \n"\ +" F:::::F A:::::::::::::::::::::A I::::I L:::::L \n"\ +" F:::::F A:::::AAAAAAAAAAAAA:::::A I::::I L:::::L LLLLLL\n"\ +"FF:::::::FF A:::::A A:::::A II::::::II LL:::::::LLLLLLLLL:::::L\n"\ +"F::::::::FF A:::::A A:::::A I::::::::I L::::::::::::::::::::::L\n"\ +"F::::::::FF A:::::A A:::::A I::::::::I L::::::::::::::::::::::L\n"\ +"FFFFFFFFFFF AAAAAAA AAAAAAAIIIIIIIIII LLLLLLLLLLLLLLLLLLLLLLLL\n" + +#endif + +//#define PRINT_ERROR_NAME(errnoin) BOOST_PP_SEQ_FOR_EACH(ERROR_IF_PRINT, errnoin, ERROR_NAMES) // can't use BOOST_PP_SEQ_FOR_EACH within another BOOST_PP_SEQ_FOR_EACH +#define PRINT_ERROR_NAME(errnoin) _nvp_print_error_name(errnoin); +void _nvp_print_error_name(int errnoin); + diff --git a/splitfs_syscall_intercept/include/nv_common.h b/splitfs_syscall_intercept/include/nv_common.h new file mode 100644 index 0000000000..32b4e589a4 --- /dev/null +++ b/splitfs_syscall_intercept/include/nv_common.h @@ -0,0 +1,157 @@ +// Header file shared by nvmfileops.c, fileops_compareharness.c +#define _GNU_SOURCE + +#ifndef __NV_COMMON_H_ +#define __NV_COMMON_H_ + +#ifndef __cplusplus +#endif + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "debug.h" +#include "boost/preprocessor/seq/for_each.hpp" + +#define MIN(X,Y) (((X)<(Y))?(X):(Y)) +#define MAX(X,Y) (((X)>(Y))?(X):(Y)) + +// tell the compiler a branch is/is not likely to be followed +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) + +#define assert(x) if(UNLIKELY(!(x))) { printf("ASSERT FAILED ROHAN\n"); fflush(NULL); ERROR("NVP_ASSERT("#x") failed!\n"); exit(100); } + +// ----------------- Syscall Intercept Stuff ---------------- +#define INTF_SYSCALL long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long *result +#define RETT_SYSCALL_INTERCEPT int + +// Pass thru call to kernel +#define RETT_PASS_KERN 1 + +// Took over call. Don't pass to kernel. 
+#define RETT_NO_PASS_KERN 0 +// ---------------------------------------------------------- + +#define DO_ALIGNMENT_CHECKS 0 + +// places quotation marks around arg (eg, MK_STR(stuff) becomes "stuff") +#define MK_STR(arg) #arg +#define MK_STR2(x) MK_STR(x) +#define MK_STR3(x) MK_STR2(x) + +#define MACRO_WRAP(a) a +#define MACRO_CAT(a, b) MACRO_WRAP(a##b) + +#ifndef __cplusplus +typedef int bool; +#define false 0 +#define true 1 +#endif + + +#define BG_CLOSING 0 +#define SEQ_LIST 0 +#define RAND_LIST 1 + +// maximum number of file operations to support simultaneously +#define MAX_FILEOPS 32 +#define BUF_SIZE 40 + +// Every time a function is used, determine whether the module's functions have been resolved. +#include +#include +#include +#include +#include + +extern int OPEN_MAX; + +#define NOSANITYCHECK 1 +#if NOSANITYCHECK + #define SANITYCHECK(x) +#else + #define SANITYCHECK(x) if(UNLIKELY(!(x))) { ERROR("NVP_SANITY("#x") failed!\n"); exit(101); } +#endif + +#define ASYNC_CLOSING async_close_enable +volatile int async_close_enable; + +// Used to determine contents of flags passed to OPEN +#define FLAGS_INCLUDE(flags, x) ((flags&x)||(x==0)) +#define DUMP_FLAGS(flags, x) do{ if(FLAGS_INCLUDE(flags, x)) { DEBUG_P("%s(0x%X) ",#x,x); } }while(0) + + +#define NVP_CHECK_NVF_VALID(nvf) do{ \ + if(UNLIKELY(!nvf->valid)) { \ + DEBUG("Invalid file descriptor: %i\n", file); \ + errno = 0; \ + return -1; \ + } \ + else \ + { \ + DEBUG("this function is operating on node %p\n", nvf->node); \ + } \ +} while(0) + +#define NVP_CHECK_NVF_VALID_WR(nvf) do{ \ + if(UNLIKELY(!nvf->valid)) { \ + DEBUG("Invalid file descriptor: %i\n", file); \ + errno = 0; \ + return -1; \ + } \ + else { \ + DEBUG("this function is operating on node %p\n", nvf->node); \ + } \ +} while(0) + +#define IS_ERR(x) ((unsigned long)(x) >= (unsigned long)-4095) + +// modifications to support different FSYNC policies +#define NVMM_PATH "/mnt/pmem_emul/" + +#define SANITYCHECKNVF(nvf) \ + SANITYCHECK(nvf->valid); \ + SANITYCHECK(nvf->node != NULL); \ + SANITYCHECK(nvf->fd >= 0); \ + SANITYCHECK(nvf->fd < OPEN_MAX); \ + SANITYCHECK(nvf->offset != NULL); \ + SANITYCHECK(*nvf->offset >= 0); \ + SANITYCHECK(nvf->node->length >=0); \ + SANITYCHECK(nvf->node->maplength >= nvf->node->length); \ + SANITYCHECK(nvf->node->data != NULL) + + +#define SFS_OPS (CLOSE) (DUP) (DUP2) (EXECVE) (FSYNC) (LINK) (MKDIR) (MKDIRAT) (MKNOD) (MKNODAT) (OPEN) (READ) \ + (RENAME) (RMDIR) (SEEK) (SYMLINK) (SYMLINKAT) (UNLINK) (UNLINKAT) (WRITE) +#define DECLARE_SFS_FUNCS(FUNCT, prefix) \ + RETT_SYSCALL_INTERCEPT prefix##FUNCT(INTF_SYSCALL); +#define DECLARE_SFS_FUNCS_IWRAP(r, data, elem) DECLARE_SFS_FUNCS(elem, data) + +BOOST_PP_SEQ_FOR_EACH(DECLARE_SFS_FUNCS_IWRAP, _sfs_, SFS_OPS) + +#endif diff --git a/splitfs_syscall_intercept/src/CMakeLists.txt b/splitfs_syscall_intercept/src/CMakeLists.txt new file mode 100644 index 0000000000..58ef46035d --- /dev/null +++ b/splitfs_syscall_intercept/src/CMakeLists.txt @@ -0,0 +1,174 @@ +# +# Copyright 2017, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set(HUB_SOURCES + fileops_hub.c + nvp_printf.c + ) + +set(NVP_SOURCES + add_delay.c + dup.c + file.c + fsync.c + handle_mmaps.c + ioctl.c + log.c + mmap_cache.c + non_temporal.c + nvp_printf.c + read.c + relink.c + seek.c + splitfs_posix.c + stack.c + staging.c + tbl_mmaps.c + timers.c + truncate.c + unlink.c + write.c + ) + +set(EXPORTED_SYMBOLS + splitfs_dup + splitfs_file + splitfs_fsync + splitfs_ioctl + splitfs_read + splitfs_seek + splitfs_truncate + splitfs_unlink + splitfs_write + ) + +if(PKG_CONFIG_FOUND) + pkg_check_modules(SYSCALL_INTERCEPT libsyscall_intercept) + pkg_check_modules(CAP libcap) +else() + find_package(SYSCALL_INTERCEPT QUIET) + find_package(CAP QUIET) +endif() + +if(NOT CAP_FOUND) + message(FATAL_ERROR + "libcap not found - needed by libpmemfile + to skip building libpmemfile, set the BUILD_LIBPMEMFILE option to OFF") +endif() + +if(NOT SYSCALL_INTERCEPT_FOUND) + message(FATAL_ERROR + "libsyscall_intercept not found - needed by libpmemfile + to skip building libpmemfile, set the BUILD_LIBPMEMFILE option to OFF") +endif() + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +add_definitions(-DPRINT_DEBUG_FILE=0 -DDATA_JOURNALING_ENABLED=1 + -DPOSIX_ENABLED=0 -DNVM_DELAY=0 -DSYSCALL_APPENDS=0 -DPASS_THROUGH_CALLS=0 + -DINSTRUMENT_CALLS=0 -DSHOW_DEBUG=0 -DSPIN_ON_ERROR=0 -DUSE_PTHREAD_LOCK=0 + -DUSE_SCHED_GETCPU=1 -DINTEGRITY_CHECK=0 -DMEASURE_TIMING=0 + -DUSE_SINGLE_LOCK=0 -DENABLE_FSYNC_TO_BS=0 -DENABLE_FSYNC_TO_CACHE=0 + -DENABLE_FALLOC=1 -DUSE_BTREE=1 -DUNMAP_ON_CLOSE=0) + +# XXX OBJECT library type is not supported on some old cmake versions + +add_library(fileops_nvp_o OBJECT ${NVP_SOURCES}) + +add_library(nvp_shared SHARED $<TARGET_OBJECTS:fileops_nvp_o>) +add_library(nvp_static_unscoped STATIC $<TARGET_OBJECTS:fileops_nvp_o>) + +target_link_libraries(nvp_shared PRIVATE ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(nvp_shared PRIVATE ${CMAKE_DL_LIBS}) +target_link_libraries(nvp_shared PRIVATE + -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libsplitfs.map) + +set_target_properties(nvp_static_unscoped PROPERTIES OUTPUT_NAME fileops_nvp_unscoped) +set_target_properties(nvp_shared PROPERTIES OUTPUT_NAME fileops_nvp) +#set_target_properties(nvp_shared PROPERTIES VERSION ${VERSION} SOVERSION ${VERSION_MAJOR}) + +add_custom_command(OUTPUT libfileops_nvp.a + COMMAND objcopy --localize-hidden `sed -n + "'s/^\\s*\\([a-zA-Z0-9_]*\\);$$/-G \\1/p'" + ${CMAKE_CURRENT_SOURCE_DIR}/libsplitfs.map` libfileops_nvp_unscoped.a + libfileops_nvp.a + DEPENDS nvp_static_unscoped) +add_custom_target(nvp_static ALL DEPENDS libfileops_nvp.a) + +install(TARGETS nvp_shared 
LIBRARY + CONFIGURATIONS Release None RelWithDebInfo + DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +install(TARGETS nvp_shared LIBRARY + CONFIGURATIONS Debug + DESTINATION ${CMAKE_INSTALL_LIBDIR}/nvp_debug) + +add_library(splitfs_o OBJECT ${HUB_SOURCES}) + +add_library(splitfs_shared SHARED $<TARGET_OBJECTS:splitfs_o>) +add_library(splitfs_static_unscoped STATIC $<TARGET_OBJECTS:splitfs_o>) + +target_link_libraries(splitfs_shared PRIVATE ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(splitfs_shared PRIVATE ${CMAKE_DL_LIBS}) +target_link_libraries(splitfs_shared PRIVATE nvp_shared) +target_link_libraries(splitfs_shared PRIVATE + -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libsplitfs.map) + +set_target_properties(splitfs_static_unscoped PROPERTIES OUTPUT_NAME splitfs_unscoped) +set_target_properties(splitfs_shared PROPERTIES OUTPUT_NAME splitfs) +#set_target_properties(splitfs_shared PROPERTIES VERSION ${VERSION} SOVERSION ${VERSION_MAJOR}) + +add_custom_command(OUTPUT libsplitfs.a + COMMAND objcopy --localize-hidden `sed -n + "'s/^\\s*\\([a-zA-Z0-9_]*\\);$$/-G \\1/p'" + ${CMAKE_CURRENT_SOURCE_DIR}/libsplitfs.map` libsplitfs_unscoped.a + libsplitfs.a + DEPENDS splitfs_static_unscoped) +add_custom_target(splitfs_static ALL DEPENDS libsplitfs.a) + +install(TARGETS splitfs_shared LIBRARY + CONFIGURATIONS Release None RelWithDebInfo + DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +install(TARGETS splitfs_shared LIBRARY + CONFIGURATIONS Debug + DESTINATION ${CMAKE_INSTALL_LIBDIR}/splitfs_debug) + + +#install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libpmemfile.a +# CONFIGURATIONS Release None +# DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +#install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libpmemfile.a +# CONFIGURATIONS Debug +# DESTINATION ${CMAKE_INSTALL_LIBDIR}/pmemfile_debug) + +add_cstyle(splitfs) +add_check_whitespace(splitfs) diff --git a/splitfs_syscall_intercept/src/Makefile b/splitfs_syscall_intercept/src/Makefile new file mode 100755 index 0000000000..1b408a845c --- /dev/null +++ b/splitfs_syscall_intercept/src/Makefile @@ -0,0 +1,44 @@ +# Common to all lib*.so +COMMON_OBJ=nvp_printf.o + +all: sofiles $(COMMON_OBJ) + +include common.mk + +sofiles: $(COMMON_OBJ) $(NVP_SOFILES) + + +# lib*.so files # + +# Uncomment the above in the end. +libnvp.so: $(COMMON_OBJ) add_delay.o bg_clear_mmap.o dup.o execve.o file.o fsync.o handle_mmaps.o link.o log.o lru_cache.o mkdir.o mknod.o mmap_cache.o non_temporal.o read.o relink.o rename.o rmdir.o seek.o splitfs_posix.o stack.o staging.o tbl_mmaps.o thread_handle.o timers.o unlink.o utils.o write.o lfq.o + $(CC) $(CFLAGS) -fpic -shared -Wl,-soname,$@ -o $@ $^ -ldl -funroll-loops -L. 
-lsyscall_intercept -lrt + +memcpy.o: memcpy.c + $(CC) -c $< -o $@ -O4 -march=core2 -m64 -fPIC + + +# Boring stuff # + +clean: + rm -f *.o *.i *.ci *.so xdd_result_*.txt logs/*.result iogen simplecat + +.PRECIOUS: %.i +%.i : %.c + $(CC) $(CFLAGS) -c -E $< -o $@ + +.PRECIOUS: %.ci +%.ci : %.i + indent < $< | grep -v '^#' > $@ + +%.oi : %.ci + $(CC) $(CFLAGS) -x c -c $< -o $@ + +%.o : %.c + $(CC) $(CFLAGS) -I../include -c -Wno-unknown-pragmas $< -o $@ + +%.o : %.S + $(CC) $(COPTIMIZATIONS) -fPIC -c $< -o $@ + +include make-hs.mk + diff --git a/splitfs_syscall_intercept/src/add_delay.c b/splitfs_syscall_intercept/src/add_delay.c new file mode 100755 index 0000000000..ab0f67ae80 --- /dev/null +++ b/splitfs_syscall_intercept/src/add_delay.c @@ -0,0 +1,110 @@ +#include "add_delay.h" + +// Set CPU frequency correctly +#define _CPUFREQ 3600LLU /* MHz */ + +#define NS2CYCLE(__ns) (((__ns) * _CPUFREQ) / 1000) + +#define BANDWIDTH_MONITOR_NS 10000 +#define SEC_TO_NS(x) (x * 1000000000UL) + +#define ENABLE_PERF_MODEL + +// performance parameters +/* SCM read extra latency than DRAM */ +uint32_t SCM_EXTRA_READ_LATENCY_NS = 220; +// We assume WBARRIER LATENCY is 0 since write back queue can hide this even in +// power failure. +// https://software.intel.com/en-us/blogs/2016/09/12/deprecate-pcommit-instruction +uint32_t SCM_WBARRIER_LATENCY_NS = 0; + +/* SCM write bandwidth */ +uint32_t SCM_BANDWIDTH_MB = 21000; +/* DRAM system peak bandwidth */ +uint32_t DRAM_BANDWIDTH_MB = 63000; + +uint64_t bandwidth_consumption; +static uint64_t monitor_start = 0, monitor_end = 0, now = 0; + +pthread_mutex_t mlfs_nvm_mutex; + +static inline void PERSISTENT_BARRIER(void) +{ + asm volatile ("sfence\n" : : ); +} + +/////////////////////////////////////////////////////// + +static inline void emulate_latency_ns(uint32_t ns) +{ + uint64_t cycles, start, stop; + + start = asm_rdtscp(); + cycles = NS2CYCLE(ns); + //printf("cycles %lu\n", cycles); + + do { + /* RDTSC doesn't necessarily wait for previous instructions to complete + * so a serializing instruction is usually used to ensure previous + * instructions have completed. However, in our case this is a desirable + * property since we want to overlap the latency we emulate with the + * actual latency of the emulated instruction. + */ + stop = asm_rdtscp(); + } while (stop - start < cycles); +} + +void perfmodel_add_delay(int read, size_t size) +{ +#ifdef ENABLE_PERF_MODEL + uint32_t extra_latency; + uint32_t do_bandwidth_delay; + + // Only allowed for mkfs. + /* + if (!bandwidth_consumption) { + if (!warning) { + printf("\033[31m WARNING: Bandwidth tracking variable is not set." + " Running program must be mkfs \033[0m\n"); + warning = 1; + } + return ; + } + */ + + now = asm_rdtscp(); + + if (now >= monitor_end) { + monitor_start = now; + monitor_end = monitor_start + NS2CYCLE(BANDWIDTH_MONITOR_NS); + bandwidth_consumption = 0; + } + + if (__sync_add_and_fetch(&bandwidth_consumption, size) >= + ((SCM_BANDWIDTH_MB << 20) / (SEC_TO_NS(1UL) / BANDWIDTH_MONITOR_NS))) + do_bandwidth_delay = 1; + else + do_bandwidth_delay = 0; + + if (read) { + extra_latency = SCM_EXTRA_READ_LATENCY_NS; + } else + extra_latency = SCM_WBARRIER_LATENCY_NS; + + // bandwidth delay for both read and write. + if (do_bandwidth_delay) { + // Due to the writeback cache, write does not have latency + // but it has bandwidth limit. 
+ // The following is emulated delay when bandwidth is full + extra_latency += (float) ((int)size * + (1 - (float)(((float) SCM_BANDWIDTH_MB)/1000) / + (((float)DRAM_BANDWIDTH_MB)/1000)) / (((float)SCM_BANDWIDTH_MB)/1000)); + pthread_mutex_lock(&mlfs_nvm_mutex); + emulate_latency_ns(extra_latency); + pthread_mutex_unlock(&mlfs_nvm_mutex); + } else + emulate_latency_ns(extra_latency); + +#endif + return; +} diff --git a/splitfs_syscall_intercept/src/add_delay.h b/splitfs_syscall_intercept/src/add_delay.h new file mode 100755 index 0000000000..63ff7d11dc --- /dev/null +++ b/splitfs_syscall_intercept/src/add_delay.h @@ -0,0 +1,17 @@ +#ifndef _LEDGER_ADD_DELAY_H_ +#define _LEDGER_ADD_DELAY_H_ + +// to use O_DIRECT flag +// +#include +#include +#include +#include +#include +#include + +#include "util.h" + +void perfmodel_add_delay(int read, size_t size); + +#endif diff --git a/splitfs_syscall_intercept/src/bg_clear_mmap.c b/splitfs_syscall_intercept/src/bg_clear_mmap.c new file mode 100644 index 0000000000..f10dafc8cd --- /dev/null +++ b/splitfs_syscall_intercept/src/bg_clear_mmap.c @@ -0,0 +1,189 @@ +/* + * ===================================================================================== + * + * Filename: bg_clear_mmap.c + * + * Description: + * + * Version: 1.0 + * Created: 09/28/2019 06:41:42 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +#include +#include +#include "bg_clear_mmap.h" +#include "add_delay.h" + +static pthread_t bg_cleaning_thread; +static pthread_cond_t bg_cleaning_signal; +static pthread_mutex_t mu_clean; +static int clean_overwrite; + +static void clean_dr_mmap() { + struct free_dr_pool *temp_dr_good_info = NULL; + int dr_fd = 0, ret = 0, i = 0, j = 0; + int num_blocks = clean_overwrite == 1 ? (DR_OVER_SIZE / MMAP_PAGE_SIZE) : (DR_SIZE / MMAP_PAGE_SIZE); + size_t mmap_size = clean_overwrite == 1 ? DR_OVER_SIZE : DR_SIZE; + char prefault_buf[MMAP_PAGE_SIZE]; + struct stat stat_buf; + char dr_fname[256]; + + DEBUG_FILE("%s: Enterred BG thread successfully. Will mmap\n", __func__); + + for (i = 0; i < MMAP_PAGE_SIZE; i++) + prefault_buf[i] = '0'; + for (i = 0; i < BG_NUM_DR; i++) { + temp_dr_good_info = (struct free_dr_pool *) malloc(sizeof(struct free_dr_pool)); + if (clean_overwrite) + sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-OVER-XXXXXX"); + else + sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-XXXXXX"); + dr_fd = syscall_no_intercept(SYS_open, mktemp(dr_fname), O_RDWR | O_CREAT, 0666); + if (dr_fd < 0) { + MSG("%s: mkstemp of DR file failed. Err = %s\n", + __func__, strerror(-dr_fd)); + assert(0); + } + + ret = posix_fallocate(dr_fd, 0, mmap_size); + + if (ret < 0) { + MSG("%s: posix_fallocate failed. Err = %s\n", + __func__, strerror(errno)); + assert(0); + } + + temp_dr_good_info->start_addr = (unsigned long) FSYNC_MMAP + ( + NULL, + mmap_size, + PROT_READ | PROT_WRITE, //max_perms, + MAP_SHARED | MAP_POPULATE, + dr_fd, //fd_with_max_perms, + 0 + ); + if (temp_dr_good_info->start_addr == 0) { + MSG("%s: mmap failed. 
Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + for (j = 0; j < num_blocks; j++) { +#if NON_TEMPORAL_WRITES + if(MEMCPY_NON_TEMPORAL((char *)temp_dr_good_info->start_addr + j*MMAP_PAGE_SIZE, prefault_buf, MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy failed\n", __func__); + assert(0); + } +#else //NON_TEMPORAL_WRITES + if(FSYNC_MEMCPY((char *)temp_dr_good_info->start_addr + j*MMAP_PAGE_SIZE, prefault_buf, MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy failed\n", __func__); + assert(0); + } +#endif //NON_TEMPORAL_WRITES +#if NVM_DELAY + perfmodel_add_delay(0, MMAP_PAGE_SIZE); +#endif //NVM_DELAY + } + + num_mmap++; + num_drs++; + fstat(dr_fd, &stat_buf); + temp_dr_good_info->dr_serialno = stat_buf.st_ino; + temp_dr_good_info->valid_offset = 0; + if (clean_overwrite) { + temp_dr_good_info->dr_offset_start = 0; + temp_dr_good_info->dr_offset_end = DR_OVER_SIZE; + } else { + temp_dr_good_info->dr_offset_start = DR_SIZE; + temp_dr_good_info->dr_offset_end = temp_dr_good_info->valid_offset; + } + temp_dr_good_info->dr_fd = dr_fd; + DEBUG_FILE("%s: Unmapped and mapped DR file again\n", __func__); + DEBUG_FILE("%s: REMAPPED USELESS FILE dr_fd = %d, dr addr = %p, dr v.o = %lu, dr off start = %lu, dr off end = %lu\n", + __func__, temp_dr_good_info->dr_fd, temp_dr_good_info->start_addr, temp_dr_good_info->valid_offset, + temp_dr_good_info->dr_offset_start, temp_dr_good_info->dr_offset_end); + + //LFDS711_QUEUE_UMM_SET_VALUE_IN_ELEMENT(temp_dr_good_info->qe, temp_dr_good_info); + if (clean_overwrite) { + if (lfq_enqueue(&staging_over_mmap_queue_ctx, temp_dr_good_info) != 0) + assert(0); + //lfds711_queue_umm_enqueue( &qs_over, &(temp_dr_good_info->qe) ); + } else { + if (lfq_enqueue(&staging_mmap_queue_ctx, temp_dr_good_info) != 0) + assert(0); + //lfds711_queue_umm_enqueue( &qs, &(temp_dr_good_info->qe) ); + } + dr_fname[0] = '\0'; + __atomic_fetch_add(&num_drs_left, 1, __ATOMIC_SEQ_CST); + } + + DEBUG_FILE("%s: Returning successfully\n", __func__); + DEBUG_FILE("%s: ------------- \n", __func__); + + run_background_cleaning_thread = 0; + clean_overwrite = 0; +} + +static void *bgThreadCleaningWrapper() { + start: + pthread_mutex_lock(&mu_clean); + waiting_for_cleaning_signal = 1; + while(!run_background_cleaning_thread) { + pthread_cond_wait(&bg_cleaning_signal, &mu_clean); + } + waiting_for_cleaning_signal = 0; + pthread_mutex_unlock(&mu_clean); + clean_dr_mmap(); + if(!exit_bg_cleaning_thread) + goto start; + started_bg_cleaning_thread = 0; + return NULL; +} + +static void activateBgCleaningThread(int is_overwrite) { + pthread_mutex_lock(&mu_clean); + run_background_cleaning_thread = 1; + if (is_overwrite) + clean_overwrite = 1; + else + clean_overwrite = 0; + pthread_cond_signal(&bg_cleaning_signal); + pthread_mutex_unlock(&mu_clean); +} + +void startBgCleaningThread() { + if (!started_bg_cleaning_thread) { + started_bg_cleaning_thread = 1; + pthread_create(&bg_cleaning_thread, NULL, &bgThreadCleaningWrapper, NULL); + } +} + +void waitForBgCleaningThread() { + if(started_bg_cleaning_thread) { + pthread_join(bg_cleaning_thread, NULL); + } +} + +void cancelBgCleaningThread() { + if(started_bg_cleaning_thread) { + pthread_cancel(bg_cleaning_thread); + pthread_testcancel(); + } +} + +void initEnvForBgClean() { + pthread_cond_init(&bg_cleaning_signal, NULL); + pthread_mutex_init(&mu_clean, NULL); +} + +void callBgCleaningThread(int is_overwrite) { + if(run_background_cleaning_thread) + return; + calledBgCleaningThread++; + activateBgCleaningThread(is_overwrite); +} 
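The file above pre-creates, pre-faults, and zero-fills DR staging files on a background thread so the hot path can dequeue ready-to-use mappings. The header that follows declares the lifecycle API; below is a minimal usage sketch (illustrative only -- `example_dr_pool_refill` is a hypothetical caller, not part of this patch):

```c
/* Illustrative sketch, not part of the patch: how the background
 * DR-cleaning API declared in bg_clear_mmap.h is meant to be driven. */
#include "bg_clear_mmap.h"

static void example_dr_pool_refill(void)
{
	initEnvForBgClean();      /* one-time condvar + mutex setup */
	startBgCleaningThread();  /* spawn worker; it sleeps until signaled */

	/* Wake the worker to pre-create and zero-fill a batch of DR
	 * staging files: pass 1 for the overwrite pool, 0 for appends. */
	callBgCleaningThread(0);

	/* Teardown: let the in-flight batch finish, then join... */
	exit_bg_cleaning_thread = 1;
	waitForBgCleaningThread();
	/* ...or abort it outright with cancelBgCleaningThread(). */
}
```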
diff --git a/splitfs_syscall_intercept/src/bg_clear_mmap.h b/splitfs_syscall_intercept/src/bg_clear_mmap.h new file mode 100644 index 0000000000..6118c10d91 --- /dev/null +++ b/splitfs_syscall_intercept/src/bg_clear_mmap.h @@ -0,0 +1,24 @@ +#ifndef __NV_CLEAN_THREAD_HANDLER_H_ +#define __NV_CLEAN_THREAD_HANDLER_H_ + +#include +#include + +#include "file.h" +#include "handle_mmaps.h" +#include "timers.h" +#include "fsync.h" + +extern int run_background_cleaning_thread; +extern int started_bg_cleaning_thread; +extern int exit_bg_cleaning_thread; +extern int calledBgCleaningThread; +extern int waiting_for_cleaning_signal; + +void startBgCleaningThread(); +void waitForBgCleaningThread(); +void cancelBgCleaningThread(); +void initEnvForBgClean(); +void callBgCleaningThread(int is_overwrite); + +#endif diff --git a/splitfs_syscall_intercept/src/common.mk b/splitfs_syscall_intercept/src/common.mk new file mode 100755 index 0000000000..66c726accb --- /dev/null +++ b/splitfs_syscall_intercept/src/common.mk @@ -0,0 +1,134 @@ + +CC = gcc -g +LD = g++ +CXX=g++ +MAKE = make + +LD = g++ +CXX = g++ + +SYSTEM_TYPE ?= SYSTEM_TYPE_BEE3 # SYSTEM_TYPE_BEE3 or SYSTEM_TYPE_XUPV5 + +#MONETA_LIB_BUILD=DEBUG +MONETA_LIB_BUILD=RELEASE +SDSSD_LIB_BUILD=RELEASE + +#MONETA_LIB_VERSION=-sim +#MONETA_BUILD_TARGET=build-libsim +MONETA_LIB_VERSION= +MONETA_BUILD_TARGET=build-lib + +export BEE3HOME=/root + +LOG_OUT_DIR=$(BEE3HOME)/test/PosixNVM/logs + +DISABLE_MSG=1 +LIBNVP_DEBUG=0 +DEBUG_INTERCEPTIONS=0 + +# LEDGER VARS +LEDGER_DEBUG=0 +MOVNTI=1 +SYS_APPENDS=0 +SYS_PASS_THROUGH=0 +DELAYS=0 +LEDGER_DR_BG_CLEAN=1 +LEDGER_INSTRU=0 + +# WORKLOADS +LEDGER_YCSB=1 +LEDGER_TPCC=0 +LEDGER_REDIS=0 +LEDGER_TAR=0 +LEDGER_GIT=0 +LEDGER_RSYNC=0 + +LEDGER_TRACE_FP=1 + +# GUARANTEES +LEDGER_DATAJ=0 +LEDGER_POSIX=1 + +# END LEDGER VARS + + + +LIBNVP_SPIN_ON_ERROR=0 + +USE_PTHREAD_LOCK=0 +USE_SCHED_GETCPU=1 + +USE_SINGLE_LOCK=0 + +INTEGRITY_CHECK=0 +MEASURE_TIMING=0 + +USE_BTREE=1 +ENABLE_FSYNC_TO_BS=0 +ENABLE_FSYNC_TO_CACHE=0 +ENABLE_FALLOC=1 + +UNMAP_ON_CLOSE=0 + +#MONETA_LIB_DIR=$(BEE3HOME)/Tools/BEE3/library/src$(MONETA_LIB_VERSION)/build/$(MONETA_LIB_BUILD) +#SDSSD_LIB_DIR=${SDSSDHOME}/libs/sdssd/host/build/${SDSSD_LIB_BUILD}:${SDSSDHOME}/libs/io/host/build/${SDSSD_LIB_BUILD} +NVP_LIB_DIR=$(BEE3HOME)/test/PosixNVM + +MONETA_MOUNT_DIR = /mnt/beecube +LOCAL_TEST_DIR = /tmp/memuram0/nvp +RAMDISK_TEST_DIR = /tmp/memuram0/ + +NVP_TEST_DIR = /tmp/memuram0 + +MY_LD_LIB_PATH=$(PWD):$(NVP_LIB_DIR):$(MONETA_LIB_DIR):$(SDSSD_LIB_DIR):$$LD_LIBRARY_PATH + +LIBNVP_TREE_DIR=$(BEE3HOME)/test/PosixNVM/bin/ + +#LOAD_DIR_X=/x/HotStorage2011-NVP + +#COPTIMIZATIONS = -m64 +COPTIMIZATIONS = -O3 -m64 +#COPTIMIZATIONS = -O3 -march=core2 -m64 + +#-march=core2 -minline-all-stringops -m64 -fprefetch-loop-arrays +#-mno-align-stringops +#-DTRACE_FP_CALLS=$(LEDGER_TRACE_FP) +CFLAGS = -Doff64_t=__off64_t -DDISABLE_MSG=$(DISABLE_MSG) -DPRINT_DEBUG_FILE=$(LEDGER_DEBUG) -DDATA_JOURNALING_ENABLED=$(LEDGER_DATAJ) -DPOSIX_ENABLED=$(LEDGER_POSIX) -DTRACE_FP_CALLS=$(LEDGER_TRACE_FP) -DNVM_DELAY=$(DELAYS) -DNON_TEMPORAL_WRITES=$(MOVNTI) -DSYSCALL_APPENDS=$(SYS_APPENDS) -DPASS_THROUGH_CALLS=$(SYS_PASS_THROUGH) -DBG_CLEANING=$(LEDGER_DR_BG_CLEAN) -DINSTRUMENT_CALLS=$(LEDGER_INSTRU) -DWORKLOAD_YCSB=$(LEDGER_YCSB) -DWORKLOAD_TPCC=$(LEDGER_TPCC) -DWORKLOAD_REDIS=$(LEDGER_REDIS) -DWORKLOAD_TAR=$(LEDGER_TAR) -DWORKLOAD_GIT=$(LEDGER_GIT) -DWORKLOAD_RSYNC=$(LEDGER_RSYNC) -DSHOW_DEBUG=$(LIBNVP_DEBUG) -DSPIN_ON_ERROR=$(LIBNVP_SPIN_ON_ERROR) -Wno-unused-variable -Wall -Wundef -pthread 
-fPIC $(COPTIMIZATIONS) -D$(SYSTEM_TYPE) -DUSE_PTHREAD_LOCK=$(USE_PTHREAD_LOCK) -DUSE_SCHED_GETCPU=$(USE_SCHED_GETCPU) -DINTEGRITY_CHECK=$(INTEGRITY_CHECK) -DMEASURE_TIMING=$(MEASURE_TIMING) -DUSE_SINGLE_LOCK=$(USE_SINGLE_LOCK) -DENABLE_FSYNC_TO_BS=$(ENABLE_FSYNC_TO_BS) -DENABLE_FSYNC_TO_CACHE=$(ENABLE_FSYNC_TO_CACHE) -DENABLE_FALLOC=$(ENABLE_FALLOC) -DUSE_BTREE=$(USE_BTREE) -DUNMAP_ON_CLOSE=$(UNMAP_ON_CLOSE) -DDEBUG_INTERCEPTIONS=$(DEBUG_INTERCEPTIONS) + +CXXFLAGS=$(CFLAGS) + +MARKERRORS = sed -e "s/\(ERROR:\)/$$(tput bold;tput setaf 1)\1$$(tput sgr0)/g" | sed -e "s/\(WARNING:\)/$$(tput bold;tput setaf 3)\1$$(tput sgr0)/g" + +HIGHLIGHTERRORS = sed -e "s/\(total errors:\)/$$(tput bold;tput setaf 1)\1$$(tput sgr0)/gI" | sed -e "s/\(error:\)/$$(tput bold;tput setaf 1)\1$$(tput sgr0)/gI" + +HIGHLIGHTFAILURE = sed -e "s/\(FAILURE\)/$$(tput bold;tput setaf 1)\1$$(tput sgr0)/g" | sed -e "s/\(SUCCESS\)/$$(tput bold;tput setaf 2)\1$$(tput sgr0)/g" | sed -e "s/\(Assertion\)/$$(tput bold;tput setaf 1)FAILURE$$(tput sgr0): \1/g" + +SWAPSUCCESSFAILURE = sed -e "s/\(FAILURE\)/$$(tput bold;tput setaf 1)SECRETTEMPWORD1823$$(tput sgr0)/g" | sed -e "s/\(SUCCESS\)/$$(tput bold;tput setaf 2)FAILURE$$(tput sgr0)/g" | sed -e "s/\(SECRETTEMPWORD1823\)/$$(tput bold;tput setaf 2)SUCCESS$$(tput sgr0)/g" + +SPECIALCASEFORTESTTESTER = sed -e "s/\(test_tester_fail.testexe: RESULT: FAILURE\)/TEMPORARY/gI" | sed -e "s/\(test_tester_fail.testexe: RESULT: SUCCESS\)/test_tester_fail.testexe: RESULT: FAILURE/gI" | sed -e "s/\(TEMPORARY\)/test_tester_fail.testexe: RESULT: SUCCESS/gI" | sed -e "s/\(test_tester_fail.testexe: RESULT: FAILURE\)/\1 : DON'T TRUST THE REST OF THE TEST CASES!/gI" | sed -e "s/\(test_tester_success.testexe: RESULT: FAILURE\)/\1 : DON'T TRUST THE REST OF THE TEST CASES!/gI" | sed -e "s/\(DON'T TRUST THE REST OF THE TEST CASES!\)/$$(tput bold;tput setaf 1)\1$$(tput sgr0)/g" + +MARKINCOMPLETE = sed -e "s/\(result\)/\1: FAILURE: terminated prematurely/gI" + +MARKNOLOAD = sed -e "s/\(If you're reading this, the library is being loaded!\)/\1: FAILURE: did not load libnvp.so/gI" + +#TESTS = test_09.result +TESTS = test_tester_fail.result test_tester_success.result helloworld.result nvmfileops_test.result test_open.result test_multiplefiles.result test_simultaneousfd.result test_largefilecopy.result test_resizefiles.result test_zipper.result test_rand_zipper.result test_process_zipper.result test_thread.result test_invalid_seek.result test_rollingfd.result test_holes.result test_holes_trunc.result test_open_simultaneous.result test_open_perms.result test_holes_simultaneous.result nvmfileops_test_links.result test_odirect.result test_read_extended.result test_mkstemp.result test_stdout.result test_01.result test_02.result test_03.result test_04.result test_05.result test_07.result test_08.result test_09.result randomTest.result + +#SOFILES = fileops_hub.so nvmfileops.so fileops_compareharness.so wrapops.so fileops_filter.so moneta.so fileops_sem.so fileops_death.so +#NVP_SOFILES = libnvp.so fileops_nvm.so fileops_compareharness.so fileops_wrap.so fileops_filter.so fileops_sem.so fileops_death.so +NVP_SOFILES = libnvp.so # libfileops_nvp.so # libfileops_wrap.so libfileops_filter.so libfileops_sem.so libfileops_count.so libfileops_harness.so libfileops_perfcount.so libfileops_hackmmap.so libfileops_death.so #libfileops_bankshot2.so +#MONETA_SOFILES = libfileops_moneta.so libmoneta.so +#SDSSD_SOFILES = libfileops_sdssd.so +#BANKSHOT_SOFILES = libfileops_sdssdbs.so + +MONETA_DEV_PATH=/dev/bbd0 + 
+BDB_EXEC_DIR=$$BEE3HOME/Workloads/BDB/Multi + + +check_moneta_mounted: + if [ "`stat $(MONETA_MOUNT_DIR) | grep 'Device: fb00h' | wc -l`" == "0" ]; then echo "FAILURE: $(MONETA_MOUNT_DIR) is NOT a Moneta device!" | $(HIGHLIGHTFAILURE) ; exit 1; else echo "SUCCESS: $(MONETA_MOUNT_DIR) is on a Moneta device." | $(HIGHLIGHTFAILURE); fi + +check_moneta: + if [ "`stat . | grep 'Device: fb00h' | wc -l`" == "0" ]; then echo "FAILURE: Current directory is NOT on a Moneta device!" | $(HIGHLIGHTFAILURE) ; exit 1; else echo "SUCCESS: Current directory is on a Moneta device." | $(HIGHLIGHTFAILURE); fi + diff --git a/splitfs_syscall_intercept/src/cross-platform.h b/splitfs_syscall_intercept/src/cross-platform.h new file mode 100644 index 0000000000..104947a82d --- /dev/null +++ b/splitfs_syscall_intercept/src/cross-platform.h @@ -0,0 +1,99 @@ +#ifndef __CROSS_PLATFORM_H__ +#define __CROSS_PLATFORM_H__ +// bool define +#ifdef __KERNEL__ + #include +#else + #include +#endif + +// malloc free +#ifdef __KERNEL__ + #define malloc(x) kmalloc(x, GFP_KERNEL ) + #define free kfree + #define calloc(x,y) kmalloc(x*y, GFP_KERNEL | __GFP_ZERO ) + #include +#else + #include + #include +#endif + + +#ifndef asm + #define asm __asm +#endif + +#define cmpxchg( ptr, _old, _new ) { \ + volatile uint32_t *__ptr = (volatile uint32_t *)(ptr); \ + uint32_t __ret; \ + asm volatile( "lock; cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (_new), "0" (_old) \ + : "memory"); \ + ); \ + __ret; \ +} + +//#define CAS cmpxchg +#define ATOMIC_SET __sync_lock_test_and_set +#define ATOMIC_RELEASE __sync_lock_release + +#if defined __GNUC__ + #define ATOMIC_SUB __sync_sub_and_fetch + #define ATOMIC_SUB64 ATOMIC_SUB + #define CAS __sync_bool_compare_and_swap +#define XCHG __sync_lock_test_and_set // yes really. The 2nd arg is limited to 1 on machines with TAS but not XCHG. On x86 it's an arbitrary value + #define ATOMIC_ADD __sync_add_and_fetch + #define ATOMIC_ADD64 ATOMIC_ADD + #define mb __sync_synchronize +#if defined(__x86_64__) || defined(__i386) +// #define lmb() asm volatile( "lfence" ) +// #define smb() asm volatile( "sfence" ) + #define lmb() asm volatile("":::"memory") // compiler barrier only. 
runtime reordering already impossible on x86 + #define smb() asm volatile("":::"memory") + // "mfence" for lmb and smb makes assertion failures rarer, but doesn't eliminate, so it's just papering over the symptoms +#endif // else no definition + + // thread + #include + #include + #define THREAD_WAIT(x) pthread_join(x, NULL); + #define THREAD_ID pthread_self + #define THREAD_FN void * + #define THREAD_YIELD sched_yield + #define THREAD_TOKEN pthread_t + +#else + #include + #define ATOMIC_SUB(x,y) InterlockedExchangeAddNoFence(x, -y) + #define ATOMIC_SUB64(x,y) InterlockedExchangeAddNoFence64(x, -y) + #define ATOMIC_ADD InterlockedExchangeAddNoFence + #define ATOMIC_ADD64 InterlockedExchangeAddNoFence64 + #ifdef _WIN64 + #define mb() MemoryBarrier() + #define lmb() LoadFence() + #define smb() StoreFence() + inline bool __CAS(LONG64 volatile *x, LONG64 y, LONG64 z) { + return InterlockedCompareExchangeNoFence64(x, z, y) == y; + } + #define CAS(x,y,z) __CAS((LONG64 volatile *)x, (LONG64)y, (LONG64)z) + #else + #define mb() asm mfence + #define lmb() asm lfence + #define smb() asm sfence + inline bool __CAS(LONG volatile *x, LONG y, LONG z) { + return InterlockedCompareExchangeNoFence(x, z, y) == y; + } + #define CAS(x,y,z) __CAS((LONG volatile *)x, (LONG)y, (LONG)z) + #endif + + // thread + #include + #define THREAD_WAIT(x) WaitForSingleObject(x, INFINITE); + #define THREAD_ID GetCurrentThreadId + #define THREAD_FN WORD WINAPI + #define THREAD_YIELD SwitchToThread + #define THREAD_TOKEN HANDLE +#endif +#endif + diff --git a/splitfs_syscall_intercept/src/dup.c b/splitfs_syscall_intercept/src/dup.c new file mode 100644 index 0000000000..24c3347f07 --- /dev/null +++ b/splitfs_syscall_intercept/src/dup.c @@ -0,0 +1,293 @@ +/* + * ===================================================================================== + * + * Filename: dup.c + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:36:54 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +#include +#include + +#include +#include "nvp_lock.h" +#include "file.h" +#include "inode.h" + +void _nvp_test_invalidate_node(struct NVFile* nvf) +{ + struct NVNode* node = nvf->node; + + DEBUG("munmapping temporarily diabled...\n"); // TODO + + return; + + SANITYCHECK(node!=NULL); + + pthread_spin_lock(&node_lookup_lock[(int) (pthread_self() % NUM_NODE_LISTS)]); + NVP_LOCK_NODE_WR(nvf); + node->reference--; + NVP_UNLOCK_NODE_WR(nvf); + if (node->reference == 0) { + NVP_LOCK_NODE_WR(nvf); + int index = nvf->serialno % 1024; + _nvp_ino_lookup[index] = 0; + // FIXME: Also munmap? + nvp_cleanup_node(nvf->node, 0, 1); + node->serialno = 0; + NVP_UNLOCK_NODE_WR(nvf); + } + pthread_spin_unlock(&node_lookup_lock[(int) (pthread_self() % NUM_NODE_LISTS)]); + +} + +RETT_SYSCALL_INTERCEPT _sfs_DUP(INTF_SYSCALL) +{ + DEBUG("In %s\n", __func__); + int ret = 0, file; + + file = (int)arg0; + + MSG("DUP. 
FD: %d.\n", file); + if((file<0) || (file >= OPEN_MAX) ) { + MSG("fd %i is outside the valid range of open file descriptors; ignoring it.\n", file); + *result = -EBADF; + return RETT_NO_PASS_KERN; + } + + if(!_fd_intercept_lookup[file]) { + return RETT_PASS_KERN; + } + + struct NVFile* nvf = &_nvp_fd_lookup[file]; + + NVP_LOCK_FD_WR(nvf); + NVP_CHECK_NVF_VALID_WR(nvf); + NVP_LOCK_NODE_WR(nvf); // TODO + + ret = syscall_no_intercept(SYS_dup, file); + + if(ret < 0) + { + DEBUG("Call to %s failed: %s\n", __func__, strerror(-ret)); + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + //GLOBAL_UNLOCK_WR(); + *result = ret; + return RETT_NO_PASS_KERN; + } + + struct NVFile* nvf2 = &_nvp_fd_lookup[ret]; + + nvf->valid = 0; + nvf2->valid = 0; + + if (nvf->posix) { + DEBUG("Call posix DUP for fd %d\n", nvf->fd); + nvf2->posix = nvf->posix; + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + //GLOBAL_UNLOCK_WR(); + *result = ret; + return RETT_NO_PASS_KERN; + } + + NVP_LOCK_FD_WR(nvf2); + + if(nvf2->valid) { + ERROR("fd %i was already in use!\n", ret); + assert(!nvf2->valid); + } + else + { + //free(nvf2->offset); // TODO: free this iff it's not in use anymore to avoid memory leaks + } + + nvf2->fd = ret; + nvf2->offset = nvf->offset; + nvf2->canRead = nvf->canRead; + nvf2->canWrite = nvf->canWrite; + nvf2->append = nvf->append; + nvf2->aligned = nvf->aligned; + nvf2->serialno = nvf->serialno; + nvf2->node = nvf->node; + nvf2->posix = nvf->posix; + + SANITYCHECK(nvf2->node != NULL); + + nvf->node->reference++; + nvf->valid = 1; + nvf2->valid = 1; + + NVP_UNLOCK_NODE_WR(nvf); // nvf2->node->lock == nvf->node->lock since nvf and nvf2 share a node + NVP_UNLOCK_FD_WR(nvf); + NVP_UNLOCK_FD_WR(nvf2); + + GLOBAL_UNLOCK_WR(); + *result = nvf2->fd; + return RETT_NO_PASS_KERN; +} + +RETT_SYSCALL_INTERCEPT _sfs_DUP2(INTF_SYSCALL) +{ + DEBUG("In %s\n", __func__); + int ret = 0, file, fd2; + + file = (int)arg0; + fd2 = (int)arg1; + + MSG("DUP2. FD1: %d. FD2: %d\n", file, fd2); + + if((file<0) || (file >= OPEN_MAX) ) { + MSG("fd %i is outside the valid range of open file descriptors; ignoring it.\n", file); + *result = -EBADF; + return RETT_NO_PASS_KERN; + } + + if( (fd2<0) || (fd2 >= OPEN_MAX) ) { + MSG("fd %i is outside the valid range of open file descriptors; ignoring it.\n", fd2); + *result = -EBADF; + return RETT_NO_PASS_KERN; + } + + if(!_fd_intercept_lookup[file]) { + return RETT_PASS_KERN; + } + + if(file == fd2) + { + DEBUG("Input and output files were the same (%i)\n", file); + *result = file; + return RETT_NO_PASS_KERN; + } + + struct NVFile* nvf = &_nvp_fd_lookup[file]; + struct NVFile* nvf2 = &_nvp_fd_lookup[fd2]; + + if (nvf->posix) { + DEBUG("Call posix DUP2 for fd %d\n", nvf->fd); + nvf2->posix = nvf->posix; + ret = syscall_no_intercept(SYS_dup2, file, fd2); + nvf2->fd = ret; + *result = ret; + return RETT_NO_PASS_KERN; + } + + if(file > fd2) + { + NVP_LOCK_FD_WR(nvf); + NVP_LOCK_FD_WR(nvf2); + } else { + NVP_LOCK_FD_WR(nvf2); + NVP_LOCK_FD_WR(nvf); + } + + if( (!nvf->valid)||(!nvf2->valid) ) { + // errno = EBADF; // TODO: Uncomment this? 
+ DEBUG("Invalid FD1 %i or FD2 %i\n", file, fd2); +// NVP_UNLOCK_FD_WR(nvf); +// NVP_UNLOCK_FD_WR(nvf2); + } + + if(nvf->node == nvf2->node || !nvf2->node) { + NVP_LOCK_NODE_WR(nvf); + } else { + if(nvf->node > nvf2->node) { + NVP_LOCK_NODE_WR(nvf); + NVP_LOCK_NODE_WR(nvf2); + } else { + NVP_LOCK_NODE_WR(nvf2); + NVP_LOCK_NODE_WR(nvf); + } + } + + ret = syscall_no_intercept(SYS_dup2, file, fd2); + + if(ret < 0) + { + DEBUG("_nvp_DUP2 failed to %s " + "(returned %i): %s\n", __func__, file, + fd2, ret, strerror(-ret)); + NVP_UNLOCK_NODE_WR(nvf); + if(nvf->node != nvf2->node) { NVP_UNLOCK_NODE_WR(nvf2); } + NVP_UNLOCK_FD_WR(nvf); + NVP_UNLOCK_FD_WR(nvf2); + //GLOBAL_UNLOCK_WR(); + *result = -ret; + return RETT_NO_PASS_KERN; + } + else + { + //free(nvf2->offset); // TODO: free this iff it's not in use anymore to avoid memory leaks + } + + nvf2->valid = 0; + + if(nvf2->node && nvf->node != nvf2->node) { NVP_UNLOCK_NODE_WR(nvf2); } + + _nvp_test_invalidate_node(nvf2); + + if(ret != fd2) + { + WARNING("ret of _nvp_DUP2(%i, %i) didn't return the fd2 " + "that was just closed. Technically this doesn't " + "violate POSIX, but I DON'T LIKE IT. " + "(Got %i, expected %i)\n", + file, fd2, ret, fd2); + assert(0); + + NVP_UNLOCK_FD_WR(nvf2); + + nvf2 = &_nvp_fd_lookup[ret]; + + NVP_LOCK_FD_WR(nvf2); + + if(nvf2->valid) + { + DEBUG("%s->DUP2 returned a ret which corresponds " + "to an already open NVFile! dup2(%i, %i) " + "returned %i\n", __func__, + file, fd2, ret); + assert(0); + } + } + + nvf2->fd = ret; + nvf2->offset = nvf->offset; + nvf2->canRead = nvf->canRead; + nvf2->canWrite = nvf->canWrite; + nvf2->append = nvf->append; + nvf2->aligned = nvf->aligned; + nvf2->serialno = nvf->serialno; + nvf2->node = nvf->node; + nvf2->valid = nvf->valid; + nvf2->posix = nvf->posix; + // Increment the refernce count as this file + // descriptor is pointing to the same NVFNode + nvf2->node->reference++; + + SANITYCHECK(nvf2->node != NULL); + SANITYCHECK(nvf2->valid); + + DEBUG("fd2 should now match fd1. " + "Testing to make sure this is true.\n"); + + NVP_CHECK_NVF_VALID_WR(nvf2); + + NVP_UNLOCK_NODE_WR(nvf); // nvf2 was already unlocked. old nvf2 was not the same node, but new nvf2 shares a node with nvf1 + NVP_UNLOCK_FD_WR(nvf2); + NVP_UNLOCK_FD_WR(nvf); + + *result = nvf2->fd; + return RETT_NO_PASS_KERN; +} diff --git a/splitfs_syscall_intercept/src/execve.c b/splitfs_syscall_intercept/src/execve.c new file mode 100644 index 0000000000..0ae94502e9 --- /dev/null +++ b/splitfs_syscall_intercept/src/execve.c @@ -0,0 +1,167 @@ +#include +#include +#include + +#include "file.h" +#include "execve.h" +#include "stack.h" +#include "handle_mmaps.h" + +RETT_SYSCALL_INTERCEPT _sfs_EXECVE(INTF_SYSCALL) { + + int exec_ledger_fd = -1, i = 0; + unsigned long offset_in_map = 0; + int pid = getpid(); + char exec_nvp_filename[BUF_SIZE]; + + for (i = 0; i < 1024; i++) { + if (_nvp_fd_lookup[i].offset != NULL) + execve_fd_passing[i] = *(_nvp_fd_lookup[i].offset); + else + execve_fd_passing[i] = 0; + } + + sprintf(exec_nvp_filename, "exec-ledger-%d", pid); + exec_ledger_fd = shm_open(exec_nvp_filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (exec_ledger_fd == -1) { + printf("%s: %s\n", __func__, strerror(errno)); + assert(0); + } + + int res = syscall_no_intercept(SYS_ftruncate, exec_ledger_fd, (10*1024*1024)); + if (res <= -1) { + printf("%s: ftruncate failed. 
Err = %s\n", __func__, strerror(-res)); + assert(0); + } + + char *shm_area = mmap(NULL, 10*1024*1024, PROT_READ | PROT_WRITE, MAP_SHARED, exec_ledger_fd, 0); + if (shm_area == NULL) { + printf("%s: mmap failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + if (memcpy(shm_area + offset_in_map, _nvp_fd_lookup, 1024 * sizeof(struct NVFile)) == NULL) { + printf("%s: memcpy of fd lookup failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + offset_in_map += (1024 * sizeof(struct NVFile)); + + if (memcpy(shm_area + offset_in_map, execve_fd_passing, 1024 * sizeof(int)) == NULL) { + printf("%s: memcpy of execve offset failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + offset_in_map += (1024 * sizeof(int)); + + + if (memcpy(shm_area + offset_in_map, _nvp_node_lookup[0], 1024*sizeof(struct NVNode)) == NULL) { + printf("%s: memcpy of node lookup failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + offset_in_map += (1024*sizeof(struct NVNode)); + + if (memcpy(shm_area + offset_in_map, _nvp_ino_lookup, 1024 * sizeof(int)) == NULL) { + printf("%s: memcpy of ino lookup failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + offset_in_map += (1024 * sizeof(int)); + + if (memcpy(shm_area + offset_in_map, _nvp_free_node_list[0], 1024*sizeof(struct StackNode)) == NULL) { + printf("%s: memcpy of free node list failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + nvp_free_dr_mmaps(); + offset_in_map += (1024 * sizeof(struct StackNode)); + + return RETT_PASS_KERN; +} + +void _sfs_SHM_COPY() { + + int exec_ledger_fd = -1; + int i,j; + unsigned long offset_in_map = 0; + int pid = getpid(); + char exec_nvp_filename[BUF_SIZE]; + + sprintf(exec_nvp_filename, "exec-ledger-%d", pid); + exec_ledger_fd = shm_open(exec_nvp_filename, O_RDONLY, 0666); + + if (exec_ledger_fd == -1) { + printf("%s: shm_open failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + char *shm_area = mmap(NULL, 10*1024*1024, PROT_READ, MAP_SHARED, exec_ledger_fd, 0); + if (shm_area == NULL) { + printf("%s: mmap failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + if (memcpy(_nvp_fd_lookup, shm_area + offset_in_map, 1024 * sizeof(struct NVFile)) == NULL) { + printf("%s: memcpy of fd lookup failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + offset_in_map += (1024 * sizeof(struct NVFile)); + + if (memcpy(execve_fd_passing, shm_area + offset_in_map, 1024 * sizeof(int)) == NULL) { + printf("%s: memcpy of offset passing failed. Err = %s\n", __func__, strerror(errno)); + } + + offset_in_map += (1024 * sizeof(int)); + + for (i = 0; i < 1024; i++) { + _nvp_fd_lookup[i].offset = (size_t*)calloc(1, sizeof(int)); + *(_nvp_fd_lookup[i].offset) = execve_fd_passing[i]; + } + + if (memcpy(_nvp_node_lookup[0], shm_area + offset_in_map, 1024*sizeof(struct NVNode)) == NULL) { + printf("%s: memcpy of node lookup failed. 
Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + for (i = 0; i < 1024; i++) { + _nvp_fd_lookup[i].node = NULL; + _nvp_node_lookup[0][i].root_dirty_num = 0; + _nvp_node_lookup[0][i].total_dirty_mmaps = 0; + _nvp_node_lookup[0][i].isRootSet = 0; + _nvp_node_lookup[0][i].height = 0; + _nvp_node_lookup[0][i].root_dirty_num = 0; + + _nvp_node_lookup[0][i].root = _nvp_backup_roots[0][i].root; + _nvp_node_lookup[0][i].merkle_root = _nvp_backup_roots[0][i].merkle_root; + } + + offset_in_map += (1024*sizeof(struct NVNode)); + + for (i = 0; i < 1024; i++) { + if (_nvp_fd_lookup[i].fd != -1) { + for (j = 0; j < 1024; j++) { + if (_nvp_fd_lookup[i].serialno == _nvp_node_lookup[0][j].serialno) { + _nvp_fd_lookup[i].node = &_nvp_node_lookup[0][j]; + break; + } + } + } + } + + if (memcpy(_nvp_ino_lookup, shm_area + offset_in_map, 1024 * sizeof(int)) == NULL) { + printf("%s: memcpy of ino lookup failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + offset_in_map += (1024 * sizeof(int)); + + if (memcpy(_nvp_free_node_list[0], shm_area + offset_in_map, 1024*sizeof(struct StackNode)) == NULL) { + printf("%s: memcpy of free node list failed. Err = %s\n", __func__, strerror(errno)); + assert(0); + } + + munmap(shm_area, 10*1024*1024); + shm_unlink(exec_nvp_filename); +} \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/execve.h b/splitfs_syscall_intercept/src/execve.h new file mode 100644 index 0000000000..400562bd4d --- /dev/null +++ b/splitfs_syscall_intercept/src/execve.h @@ -0,0 +1,3 @@ +int execve_fd_passing[1024]; + +void _sfs_SHM_COPY(); \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/file.c b/splitfs_syscall_intercept/src/file.c new file mode 100644 index 0000000000..55ab64675a --- /dev/null +++ b/splitfs_syscall_intercept/src/file.c @@ -0,0 +1,840 @@ +// a module which repalces the standart POSIX functions with memory mapped equivalents +#include + +#include "file.h" +// #include "perfcount.h" +#include "timers.h" +#include "bg_clear_mmap.h" +#include "stack.h" +#include "add_delay.h" +#include "log.h" +#include "tbl_mmaps.h" +#include "nvp_lock.h" +#include "lru_cache.h" +#include "thread_handle.h" + +#include "mmap_cache.h" +#include "handle_mmaps.h" +#include "fsync.h" +#include "stack.h" +#include "fsync.h" + +struct NVNode * nvp_allocate_node(int list_idx) +{ + struct NVNode *node = NULL; + int idx_in_list = -1; + int i, candidate = -1; + + idx_in_list = pop_from_stack(1, 0, list_idx); + if(idx_in_list != -1) { + node = &_nvp_node_lookup[list_idx][idx_in_list]; + node->index_in_free_list = idx_in_list; + return node; + } + /* + * Get the first unused NVNode from the global array of 1024 NVNodes. 
+ * If the node is not unusued but the reference number is + * 0, meaning that there is no thread that has this file open, + * it can be used for holding info of the new file + */ + for (i = 0; i < 1024; i++) { + if (_nvp_node_lookup[list_idx][i].serialno == 0) { + DEBUG("Allocate unused node %d\n", i); + _nvp_free_node_list[list_idx][i].free_bit = 0; + node = &_nvp_node_lookup[list_idx][i]; + node->index_in_free_list = i; + _nvp_free_node_list_head[list_idx] = _nvp_free_node_list[list_idx][node->index_in_free_list].next_free_idx; + break; + } + if (candidate == -1 && _nvp_node_lookup[list_idx][i].reference == 0) + candidate = i; + } + if (node) { + return node; + } + if (candidate != -1) { + node = &_nvp_node_lookup[list_idx][candidate]; + DEBUG("Allocate unreferenced node %d\n", candidate); + node->index_in_free_list = candidate; + _nvp_free_node_list[list_idx][candidate].free_bit = 0; + _nvp_free_node_list_head[list_idx] = _nvp_free_node_list[list_idx][candidate].next_free_idx; + return node; + } + return NULL; +} + +static struct NVNode * nvp_get_node(const char *path, struct stat *file_st, int result) +{ + int i, index, ret; + struct NVNode *node = NULL; + int node_list_idx = pthread_self() % NUM_NODE_LISTS; + instrumentation_type node_lookup_lock_time, nvnode_lock_time; + + pthread_spin_lock(&node_lookup_lock[node_list_idx]); + /* + * Checking to see if the file is already open by another thread. In this case, the same NVNode can be used by this thread + * too. But it will have its own separate NVFile, since the fd is unique per thread + */ + index = file_st->st_ino % 1024; + if (_nvp_ino_lookup[index]) { + i = _nvp_ino_lookup[index]; + if ( _nvp_fd_lookup[i].node && + _nvp_fd_lookup[i].node->serialno == file_st->st_ino) { + DEBUG("File %s is (or was) already open in fd %i " + "(this fd hasn't been __open'ed yet)! " + "Sharing nodes.\n", path, i); + + node = _nvp_fd_lookup[i].node; + SANITYCHECK(node != NULL); + NVP_LOCK_WR(node->lock); + node->reference++; + NVP_LOCK_UNLOCK_WR(node->lock); + + pthread_spin_unlock(&node_lookup_lock[node_list_idx]); + goto out; + } + } + /* + * This is the first time the file is getting opened. + * The first unused NVNode is assigned here to hold info of the file. + */ + if(node == NULL) { + DEBUG("File %s is not already open. " + "Allocating new NVNode.\n", path); + node = nvp_allocate_node(node_list_idx); + NVP_LOCK_WR(node->lock); + node->serialno = file_st->st_ino; + node->reference++; + NVP_LOCK_UNLOCK_WR(node->lock); + if(UNLIKELY(!node)) { + MSG("%s: Node is null\n", __func__); + assert(0); + } + } + index = file_st->st_ino % 1024; + if (_nvp_ino_lookup[index] == 0) + _nvp_ino_lookup[index] = result; + + node->free_list_idx = node_list_idx; + + pthread_spin_unlock(&node_lookup_lock[node_list_idx]); + + NVP_LOCK_WR(node->lock); + + /* + * Checking if the mapping exists in the global mmap() cache for this inode number. + * If it does, copy all the mapping + * from the global mmap() cache on to the NVNode mmap() + */ + nvp_add_to_inode_mapping(node, node->backup_serialno); + nvp_reset_mappings(node); + ret = nvp_retrieve_inode_mapping(node); + if(ret != 0) { + /* + * If the height is not 0, that means that there exist levels + * in the file backed mmap() tree. So need to free + * the file backed mmap() tree completely. 
+ */ + if(node->height != 0) + nvp_cleanup_node(node, 0, 1); + } + node->length = file_st->st_size; + node->maplength = 0; + node->true_length = node->length; + if (node->true_length >= LARGE_FILE_THRESHOLD) + node->is_large_file = 1; + else + node->is_large_file = 0; + node->dr_mem_used = 0; + if (node->true_length == 0) { + clear_tbl_mmap_entry(&_nvp_tbl_mmaps[file_st->st_ino % APPEND_TBL_MAX], NUM_APP_TBL_MMAP_ENTRIES); +#if DATA_JOURNALING_ENABLED + + clear_tbl_mmap_entry(&_nvp_over_tbl_mmaps[file_st->st_ino % OVER_TBL_MAX], NUM_OVER_TBL_MMAP_ENTRIES); + +#endif // DATA_JOURNALING_ENABLED + + } + + if(node->dr_info.start_addr != 0 || node->dr_over_info.start_addr != 0) { + DEBUG_FILE("%s: calling transfer to free pool. Inode = %lu\n", __func__, node->serialno); + nvp_transfer_to_free_dr_pool(node); + } + node->async_file_close = 0; + node->backup_serialno = node->serialno; + + NVP_LOCK_UNLOCK_WR(node->lock); +out: + return node; +} + +/** + * Do the preprocessing for open call. + * + * 1. Check if 'path' is a valid pointer + * 2. Check if the path is on the PM mount, else passthru (to kernel). + * 3. If the file is not present and is not set to be created, passthru. + * 4. If file is not a regular or block file, passthru. + * + * Return value: Determines if it should be passed through to the kernel. + * Return value - int* error: Indicates any error that needs to be passed back to the user via errno. This value is negative of errno + */ +RETT_SYSCALL_INTERCEPT _sfs_OPEN_preprocess(char *path, int oflag, int* error) { + int access_result; + + *error = 0; + + access_result = syscall_no_intercept(SYS_access, path, F_OK); + /** + * Before we derefernece 'path' pointer, we need to check if it is a valid pointer, but not crash it + * (segfault) if it's invalid. + * + * Since it is not possible to validate a pointer in the user-space, + * we are making an access system call which validates + * the pointer. + */ + if(access_result == -EFAULT) { + *error = -EFAULT; + return RETT_NO_PASS_KERN; + } + + /** + * In case absolute path is specified, check if it belongs to the persistent memory + * mount and only then use SplitFS, else redirect to POSIX + */ + if(path[0] == '/') { + int len = strlen(NVMM_PATH); + char dest[len + 1]; + dest[len] = '\0'; + strncpy(dest, path, len); + + if(strcmp(dest, NVMM_PATH) != 0) { + // If not pmem mount then do not intercept + MSG("Not a pmem file, passing through to kernel\n"); + return RETT_PASS_KERN; + } + } + + if(access_result && !FLAGS_INCLUDE(oflag, O_CREAT)) + { + DEBUG("%s: File does not exist and is not set to be created. Passing to kernel\n", __func__); + return RETT_PASS_KERN; + + } else { + // file exists + struct stat file_st; + + int stat_ret = syscall_no_intercept(SYS_stat, path, &file_st); + if(stat_ret < 0) { + DEBUG("%s: failed to get device stats for \"%s\" (error: %s). Passing to kernel\n", __func__, + path, strerror(-stat_ret)); + return RETT_PASS_KERN; + } + else if(S_ISREG(file_st.st_mode)) { + DEBUG("%s: file exists and is a regular file.", __func__); + } + else if (S_ISBLK(file_st.st_mode)) { + DEBUG("%s: file exists and is a block device.", __func__); + } + else + { + DEBUG("%s: file exists and is not a regular or block file. 
Passing to kernel\n", __func__); + return RETT_PASS_KERN; + } + } + + return RETT_NO_PASS_KERN; +} + +RETT_SYSCALL_INTERCEPT _sfs_OPEN(INTF_SYSCALL) { + char *path; + int oflag, mode, access_result, fd, error, pp_ret; + +#if BG_CLOSING + int closed_filedesc = -1, hash_index = -1; + fd = -1; +#if SEQ_LIST || RAND_LIST + struct ClosedFiles *clf = NULL; +#else // SEQ_LIST || RAND_LIST + struct InodeClosedFile *tbl = NULL; +#endif // SEQ_LIST || RAND_LIST +#endif // BG_CLOSING + + // Parse the syscall args + path = (char *)arg0; + oflag = (int)arg1; + + + pp_ret = _sfs_OPEN_preprocess(path, oflag, &error); + + // Passthru + if(pp_ret == RETT_PASS_KERN) { + MSG("Passing through OPEN call to kernel\n"); + return RETT_PASS_KERN; + } + + // Error + don't passthru + if(error != 0) { + *result = error; + return RETT_NO_PASS_KERN; + } + + // Start _nvp_OPEN impl here. + mode = (int)arg2; + + int sfs_result; + instrumentation_type open_time, clf_lock_time, nvnode_lock_time; + + START_TIMING(open_t, open_time); + GLOBAL_LOCK_WR(); + + DEBUG_FILE("_nvp_OPEN(%s)\n", path); + num_open++; + + DEBUG("Attempting to _nvp_OPEN the file \"%s\" with the following " + "flags (0x%X): ", path, oflag); + + /* + * Print all the flags passed to open() + */ + if((oflag&O_RDWR)||((oflag&O_RDONLY)&&(oflag&O_WRONLY))) { + DEBUG_P("O_RDWR "); + } else if(FLAGS_INCLUDE(oflag,O_WRONLY)) { + DEBUG_P("O_WRONLY "); + } else if(FLAGS_INCLUDE(oflag, O_RDONLY)) { + DEBUG_P("O_RDONLY "); + } + DUMP_FLAGS(oflag,O_APPEND); + DUMP_FLAGS(oflag,O_CREAT); + DUMP_FLAGS(oflag,O_TRUNC); + DUMP_FLAGS(oflag,O_EXCL); + DUMP_FLAGS(oflag,O_SYNC); + DUMP_FLAGS(oflag,O_ASYNC); + DUMP_FLAGS(oflag,O_DSYNC); + DUMP_FLAGS(oflag,O_FSYNC); + DUMP_FLAGS(oflag,O_RSYNC); + DUMP_FLAGS(oflag,O_NOCTTY); + DUMP_FLAGS(oflag,O_NDELAY); + DUMP_FLAGS(oflag,O_NONBLOCK); + DUMP_FLAGS(oflag,O_DIRECTORY); + DUMP_FLAGS(oflag,O_LARGEFILE); + DUMP_FLAGS(oflag,O_NOATIME); + DUMP_FLAGS(oflag,O_DIRECT); + DUMP_FLAGS(oflag,O_NOFOLLOW); + DEBUG_P("\n"); + + struct stat file_st; + // Initialize NVNode + struct NVNode* node = NULL; + +#if BG_CLOSING + if (async_close_enable) { + if(num_files_closed >= 800 || (dr_mem_closed_files >= ((5ULL) * 1024 * 1024 * 1024))) { + ASYNC_CLOSING = 0; + checkAndActivateBgThread(); + } + } +#endif + + /* + * If creation of the file is involved, 3 parameters must be passed to open(). 
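+	 * (The mode argument is only consulted by open() when O_CREAT is
+	 * set, which is why the two call shapes below differ.)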
+ * Otherwise, 2 parameters must be passed + */ + if (FLAGS_INCLUDE(oflag, O_CREAT)) + { + instrumentation_type op_log_entry_time; + // Open system call is done here + DEBUG_FILE("%s: calling open with path = %s, flag = %d, mode = %d, ino addr = %p, ino size addr = %p\n", __func__, path, oflag, mode, &file_st.st_ino, &file_st.st_size); + fd = syscall_no_intercept(SYS_open, path, oflag & (~O_APPEND), mode); +#if !POSIX_ENABLED + if (fd >= 0) { + START_TIMING(op_log_entry_t, op_log_entry_time); + persist_op_entry(LOG_FILE_CREATE, + path, + NULL, + mode, + oflag); + END_TIMING(op_log_entry_t, op_log_entry_time); + } +#endif + } else { + DEBUG_FILE("%s: calling open with path = %s, flag = %d, mode = 0666, ino addr = %p, ino size addr = %p\n", __func__, path, oflag, &file_st.st_ino, &file_st.st_size); + fd = syscall_no_intercept(SYS_open, path, oflag & (~O_APPEND), 0666); + } + if(fd<0) + { + DEBUG("%s: Kernel open failed: %s\n", __func__, strerror(errno)); + END_TIMING(open_t, open_time); + GLOBAL_UNLOCK_WR(); + *result = fd; + return RETT_NO_PASS_KERN; + } + DEBUG_FILE("%s:(%s), fd = %d\n",__func__, path, fd); + SANITYCHECK(&_nvp_fd_lookup[fd] != NULL); + struct NVFile* nvf = NULL; + syscall_no_intercept(SYS_stat, path, &file_st); + +#if BG_CLOSING + if (async_close_enable) + checkAndActivateBgThread(); + GLOBAL_LOCK_CLOSE_WR(); + hash_index = file_st.st_ino % TOTAL_CLOSED_INODES; +#if SEQ_LIST || RAND_LIST + clf = &_nvp_closed_files[hash_index]; + + LRU_NODE_LOCK_WR(clf); + + fd = remove_from_seq_list_hash(clf, file_st.st_ino); +#else // SEQ_LIST || RAND_LIST + tbl = &inode_to_closed_file[hash_index]; + NVP_LOCK_HASH_TABLE_WR(tbl); + fd = remove_from_lru_list_hash(file_st.st_ino, 0); +#endif // SEQ_LIST || RAND_LIST + if(fd >= 0) { + if ((oflag & O_RDWR) || FLAGS_INCLUDE(oflag, O_RDONLY)) { + num_close++; + closed_filedesc = fd; + __atomic_fetch_sub(&num_files_closed, 1, __ATOMIC_SEQ_CST); +#if SEQ_LIST || RAND_LIST + LRU_NODE_UNLOCK_WR(clf); +#else // SEQ_LIST || RAND_LIST + NVP_UNLOCK_HASH_TABLE_WR(tbl); +#endif // SEQ_LIST || RAND_LIST + GLOBAL_UNLOCK_CLOSE_WR(); + + syscall_no_intercept(SYS_close, *result); + *result = closed_filedesc; + nvf = &_nvp_fd_lookup[*result]; + node = nvf->node; + __atomic_fetch_sub(&dr_mem_closed_files, nvf->node->dr_mem_used, __ATOMIC_SEQ_CST); + NVP_LOCK_FD_WR(nvf); + NVP_LOCK_NODE_WR(nvf); + nvf->valid = 0; + goto initialize_nvf; + } + } + +#if SEQ_LIST || RAND_LIST + LRU_NODE_UNLOCK_WR(clf); +#else // SEQ_LIST || RAND_LIST + NVP_UNLOCK_HASH_TABLE_WR(tbl); +#endif // SEQ_LIST || RAND_LIST + GLOBAL_UNLOCK_CLOSE_WR(); +#endif // BG_CLOSING + + // Retrieving the NVFile corresponding to the file descriptor returned by open() system call + nvf = &_nvp_fd_lookup[fd]; + DEBUG("%s: succeeded for path %s: fd %i returned. " + "filling in file info\n",__func__, path, *result); + + NVP_LOCK_FD_WR(nvf); + + // Check if the file descriptor is already open. If open, something is wrong and return error + if(nvf->valid) + { + MSG("There is already a file open with that FD (%i)!\n", *result); + assert(0); + END_TIMING(open_t, open_time); + GLOBAL_UNLOCK_WR(); + *result = fd; + return RETT_NO_PASS_KERN; + } + _fd_intercept_lookup[fd] = true; + + /* + * NVNode is retrieved here. Keeping this check because in quill it was required. 
Not necessary in Ledger + */ + if(node == NULL) + { + // Find or allocate a NVNode + node = nvp_get_node(path, &file_st, fd); + NVP_LOCK_WR(node->lock); + } + +#if BG_CLOSING + initialize_nvf: +#endif // BG_CLOSING + nvf->fd = fd; + nvf->node = node; + nvf->posix = 0; + nvf->serialno = file_st.st_ino; + /* + * Write the entry of this file into the global inode number struct. + * This contains the fd of the thread that first + * opened this file. + */ + // Set FD permissions + if((oflag & O_RDWR)||((oflag & O_RDONLY) && (oflag & O_WRONLY))) { + DEBUG("oflag (%i) specifies O_RDWR for fd %i\n", oflag, result); + nvf->canRead = 1; + nvf->canWrite = 1; + } else if(oflag&O_WRONLY) { + +#if WORKLOAD_TAR | WORKLOAD_GIT | WORKLOAD_RSYNC + nvf->posix = 0; + nvf->canRead = 1; + nvf->canWrite = 1; +#else // WORKLOAD_TAR + + MSG("File %s is opened O_WRONLY.\n", path); + MSG("Does not support mmap, use posix instead.\n"); + nvf->posix = 1; + nvf->canRead = 0; + nvf->canWrite = 1; + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + END_TIMING(open_t, open_time); + GLOBAL_UNLOCK_WR(); + *result = nvf->fd; + return 0; +#endif // WORKLOAD_TAR + + } else if(FLAGS_INCLUDE(oflag, O_RDONLY)) { + DEBUG("oflag (%i) specifies O_RDONLY for fd %i\n", + oflag, result); + nvf->canRead = 1; + nvf->canWrite = 0; + } else { + DEBUG("File permissions don't include read or write!\n"); + nvf->canRead = 0; + nvf->canWrite = 0; + assert(0); + } + + if(FLAGS_INCLUDE(oflag, O_APPEND)) { + nvf->append = 1; + } else { + nvf->append = 0; + } + + SANITYCHECK(nvf->node != NULL); + if(FLAGS_INCLUDE(oflag, O_TRUNC) && nvf->node->length) + { + DEBUG("We just opened a file with O_TRUNC that was already " + "open with nonzero length %li. Updating length.\n", + nvf->node->length); + nvf->node->length = 0; + } + nvf->posix = 0; + nvf->debug = 0; + + /* For BDB log file, workaround the fdsync issue */ + if (path[29] == 'l' && path[30] == 'o' && path[31] == 'g') { + nvf->debug = 1; + } + + nvf->offset = (size_t*)calloc(1, sizeof(int)); + *nvf->offset = 0; + + if(FLAGS_INCLUDE(oflag, O_DIRECT) && (DO_ALIGNMENT_CHECKS)) { + nvf->aligned = 1; + } else { + nvf->aligned = 0; + } + + nvf->valid = 1; + + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + + END_TIMING(open_t, open_time); + + GLOBAL_UNLOCK_WR(); + *result = nvf->fd; + return RETT_NO_PASS_KERN; +} + +RETT_SYSCALL_INTERCEPT _sfs_REAL_CLOSE(int file, ino_t serialno, int async_file_closing, long* result) { + + instrumentation_type node_lookup_lock_time, nvnode_lock_time, close_syscall_time, + copy_to_dr_pool_time, copy_to_mmap_cache_time, give_up_node_time; + int cpuid; + int node_list_idx; + + *result = 0; + + if (file < 0) { + *result = -EBADF; + return RETT_NO_PASS_KERN; + } + + struct NVFile* nvf = &_nvp_fd_lookup[file]; + num_close++; + if (nvf->posix) { + nvf->valid = 0; + nvf->posix = 0; + NVP_LOCK_NODE_WR(nvf); + nvf->node->reference--; + NVP_UNLOCK_NODE_WR(nvf); + if (nvf->node->reference == 0) { + nvf->node->serialno = 0; + int index = nvf->serialno % 1024; + _nvp_ino_lookup[index] = 0; + } + nvf->serialno = 0; + DEBUG("Call posix CLOSE for fd %d\n", nvf->fd); + return RETT_PASS_KERN; + } + + DEBUG_FILE("%s(%i): Ref count = %d\n", __func__, file, nvf->node->reference); + DEBUG_FILE("%s: Calling fsync flush on fsync\n", __func__); + cpuid = GET_CPUID(); +#if !SYSCALL_APPENDS + fsync_flush_on_fsync(nvf, cpuid, 1, 0); +#endif + /* + * nvf->node->reference contains how many threads have this file open. 
+ */ + node_list_idx = nvf->node->free_list_idx; + + pthread_spin_lock(&node_lookup_lock[node_list_idx]); + + if(nvf->valid == 0) { + pthread_spin_unlock(&node_lookup_lock[node_list_idx]); + *result = -1; + return RETT_NO_PASS_KERN; + } + if(nvf->node->reference <= 0) { + pthread_spin_unlock(&node_lookup_lock[node_list_idx]); + *result = -1; + return RETT_NO_PASS_KERN; + } + if(nvf->node->serialno != serialno) { + pthread_spin_unlock(&node_lookup_lock[node_list_idx]); + *result = -1; + return RETT_NO_PASS_KERN; + } + + NVP_LOCK_NODE_WR(nvf); + nvf->node->reference--; + NVP_UNLOCK_NODE_WR(nvf); + + if (nvf->node->reference == 0) { + nvf->node->serialno = 0; + push_in_stack(1, 0, nvf->node->index_in_free_list, node_list_idx); + } + if (async_file_closing) { + nvf->node->async_file_close = 1; + } + pthread_spin_unlock(&node_lookup_lock[node_list_idx]); + + NVP_LOCK_FD_WR(nvf); + NVP_CHECK_NVF_VALID_WR(nvf); + NVP_LOCK_NODE_WR(nvf); + + // setting valid to 0 means that this fd is not open. So can be used for a subsequent open of same or different file. + if(nvf->valid == 0) { + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + *result = -1; + return RETT_NO_PASS_KERN; + } + if(nvf->node->reference < 0) { + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + *result = -1; + return RETT_NO_PASS_KERN; + } + if(nvf->serialno != serialno) { + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + *result = -1; + return RETT_NO_PASS_KERN; + } + + nvf->valid = 0; + if (nvf->node->reference == 0) { + nvp_add_to_inode_mapping(nvf->node, nvf->serialno); + nvf->node->backup_serialno = 0; + int index = nvf->serialno % 1024; + _nvp_ino_lookup[index] = 0; + DEBUG("Close Cleanup node for %d\n", file); + if(nvf->node->dr_info.start_addr != 0 || nvf->node->dr_over_info.start_addr != 0) { + nvp_transfer_to_free_dr_pool(nvf->node); + } + nvf->node->async_file_close = 0; + nvp_cleanup_node(nvf->node, 0, 0); + } + nvf->serialno = 0; + + NVP_UNLOCK_NODE_WR(nvf); + NVP_UNLOCK_FD_WR(nvf); + + // close() system call of the file is done here. 
+	//START_TIMING(close_syscall_t, close_syscall_time);
+	return RETT_NO_PASS_KERN;
+}
+
+
+RETT_SYSCALL_INTERCEPT _sfs_CLOSE(INTF_SYSCALL) {
+	int file = (int)arg0;
+
+	DEBUG_FILE("%s: fd = %d\n", __func__, file);
+
+	if( (file<0) || (file >= OPEN_MAX) ) {
+		DEBUG("fd %i is outside the valid range of file descriptors; ignoring it.\n", file);
+		*result = -EBADF;
+		return RETT_NO_PASS_KERN;
+	}
+
+	if(!_fd_intercept_lookup[file]) {
+		return RETT_PASS_KERN;
+	}
+	_fd_intercept_lookup[file] = false;
+
+	RETT_SYSCALL_INTERCEPT rc_res;
+	ino_t serialno;
+	struct NVFile* nvf = NULL;
+	instrumentation_type close_time;
+
+#if BG_CLOSING
+	instrumentation_type clf_lock_time;
+	int previous_closed_filedesc = -1;
+	ino_t previous_closed_serialno = 0, stale_serialno = 0;
+	int cpuid, stale_fd = -1;
+	int hash_index = -1;
+#if SEQ_LIST || RAND_LIST
+	struct ClosedFiles *clf = NULL;
+#else //SEQ_LIST || RAND_LIST
+	struct InodeClosedFile *tbl = NULL;
+#endif //SEQ_LIST || RAND_LIST
+
+	//num_close++;
+	// Get the struct NVFile from the file descriptor
+
+	nvf = &_nvp_fd_lookup[file];
+
+	if (nvf->posix) {
+		nvf->valid = 0;
+		nvf->posix = 0;
+		NVP_LOCK_NODE_WR(nvf);
+		nvf->node->reference--;
+		NVP_UNLOCK_NODE_WR(nvf);
+		if (nvf->node->reference == 0) {
+			nvf->node->serialno = 0;
+			int index = nvf->serialno % 1024;
+			_nvp_ino_lookup[index] = 0;
+		}
+		nvf->serialno = 0;
+		DEBUG("Call posix CLOSE for fd %d\n", nvf->fd);
+		*result = syscall_no_intercept(SYS_close, file);
+		END_TIMING(close_t, close_time);
+		GLOBAL_UNLOCK_WR();
+		return RETT_NO_PASS_KERN;
+	}
+
+	serialno = nvf->node->serialno;
+	GLOBAL_LOCK_CLOSE_WR();
+
+	hash_index = serialno % TOTAL_CLOSED_INODES;
+
+#if SEQ_LIST || RAND_LIST
+	clf = &_nvp_closed_files[hash_index];
+
+	//START_TIMING(clf_lock_t, clf_lock_time);
+	LRU_NODE_LOCK_WR(clf);
+	//END_TIMING(clf_lock_t, clf_lock_time);
+#else //SEQ_LIST || RAND_LIST
+	tbl = &inode_to_closed_file[hash_index];
+	NVP_LOCK_HASH_TABLE_WR(tbl);
+#endif //SEQ_LIST || RAND_LIST
+	cpuid = GET_CPUID();
+	NVP_LOCK_NODE_RD(nvf, cpuid);
+
+	if(nvf->node->reference == 1) {
+		NVP_UNLOCK_NODE_RD(nvf, cpuid);
+		__atomic_fetch_add(&dr_mem_closed_files, nvf->node->dr_mem_used, __ATOMIC_SEQ_CST);
+#if SEQ_LIST || RAND_LIST
+		stale_fd = insert_in_seq_list(clf, &stale_serialno, file, serialno);
+#else //SEQ_LIST || RAND_LIST
+		stale_fd = insert_in_lru_list(file, serialno, &stale_serialno);
+#endif //SEQ_LIST || RAND_LIST
+		if(stale_fd >= 0 && stale_serialno > 0) {
+			previous_closed_filedesc = stale_fd;
+			previous_closed_serialno = stale_serialno;
+		}
+
+		if(previous_closed_filedesc != -1) {
+			_sfs_REAL_CLOSE(previous_closed_filedesc, previous_closed_serialno, 1, result);
+		}
+		else
+			__atomic_fetch_add(&num_files_closed, 1, __ATOMIC_SEQ_CST);
+
+#if SEQ_LIST || RAND_LIST
+		LRU_NODE_UNLOCK_WR(clf);
+#else //SEQ_LIST || RAND_LIST
+		NVP_UNLOCK_HASH_TABLE_WR(tbl);
+#endif //SEQ_LIST || RAND_LIST
+		GLOBAL_UNLOCK_CLOSE_WR();
+
+		END_TIMING(close_t, close_time);
+		GLOBAL_UNLOCK_WR();
+		*result = 0;
+		return RETT_NO_PASS_KERN;
+	}
+
+	NVP_UNLOCK_NODE_RD(nvf, cpuid);
+#if SEQ_LIST || RAND_LIST
+	LRU_NODE_UNLOCK_WR(clf);
+#else //SEQ_LIST || RAND_LIST
+	NVP_UNLOCK_HASH_TABLE_WR(tbl);
+#endif //SEQ_LIST || RAND_LIST
+	GLOBAL_UNLOCK_CLOSE_WR();
+#endif //BG_CLOSING
+
+	START_TIMING(close_t, close_time);
+
+	GLOBAL_LOCK_WR();
+	DEBUG_FILE("%s: (%i)\n", __func__, file);
+
+	nvf = &_nvp_fd_lookup[file];
+	serialno = nvf->node->serialno;
+
+	rc_res = _sfs_REAL_CLOSE(file, serialno, 0, result);
+
+	END_TIMING(close_t, close_time);
+	GLOBAL_UNLOCK_WR();
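+	// rc_res carries _sfs_REAL_CLOSE's pass/no-pass decision back to the
+	// syscall-intercept dispatcher unchanged.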
+ return rc_res; +} + +void nvp_cleanup_node(struct NVNode *node, int free_root, int unmap_btree) +{ + + unsigned int height = node->height; + unsigned long *root = node->root; + unsigned long *merkle_root = node->merkle_root; + unsigned long *dirty_cache; + int total_dirty_mmaps = node->total_dirty_mmaps; + int root_dirty_num = node->root_dirty_num; + + DEBUG("Cleanup: root 0x%x, height %u\n", root, height); + + if(root_dirty_num > 0) + dirty_cache = node->root_dirty_cache; + else + dirty_cache = NULL; + + if(unmap_btree && node->root_dirty_num) { + // munmap() all the file backed mmap()s of this file. + nvp_free_btree(root, merkle_root, height, dirty_cache, root_dirty_num, total_dirty_mmaps); + } + + /* + * Deallocate everything related to NVNode. This should be done at the end when Ledger is exiting. + */ + if (free_root && node->root[0]) { + free(node->root); + free(node->merkle_root); + free(node->root_dirty_cache); + node->root = NULL; + node->merkle_root = NULL; + node->root_dirty_cache = NULL; + return; + } + // Copy all the DR mmap()s linked to this node, to the global pool of DR mmap()s + /* + * Resetting the file backed mmap addresses, merkle tree addresses and the dirty file backed mmap cache of this node to 0. + */ + if(!unmap_btree) + nvp_reset_mappings(node); +} diff --git a/splitfs_syscall_intercept/src/file.h b/splitfs_syscall_intercept/src/file.h new file mode 100644 index 0000000000..bbd9435071 --- /dev/null +++ b/splitfs_syscall_intercept/src/file.h @@ -0,0 +1,80 @@ +/* + * + * ===================================================================================== + * + * Filename: file.h + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:14:13 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ + +#ifndef SPLITFS_FILE_H +#define SPLITFS_FILE_H + +#include +#include "lfq.h" +#include "nvp_lock.h" +#include "inode.h" + +struct NVFile +{ + NVP_LOCK_DECL; + volatile bool valid; + int fd; + volatile size_t* offset; + bool canRead; + bool canWrite; + bool append; + bool aligned; + ino_t serialno; // duplicated so that iterating doesn't require following every node* + struct NVNode* node; + bool posix; + bool debug; + char padding[200]; + int file_stream_flags; +}; + +struct backupRoots { + unsigned long *root; + unsigned long *merkle_root; + unsigned long *root_dirty_cache; +}; + +#define TOTAL_CLOSED_INODES 4096 + +#define FD_LOCKING 1 +#if FD_LOCKING + +#define NVP_LOCK_FD_RD(nvf, cpuid) NVP_LOCK_RD(nvf->lock, cpuid) +#define NVP_UNLOCK_FD_RD(nvf, cpuid) NVP_LOCK_UNLOCK_RD(nvf->lock, cpuid) +#define NVP_LOCK_FD_WR(nvf) NVP_LOCK_WR( nvf->lock) +#define NVP_UNLOCK_FD_WR(nvf) NVP_LOCK_UNLOCK_WR(nvf->lock) + +#else + +#define NVP_LOCK_FD_RD(nvf, cpuid) {(void)(cpuid);} +#define NVP_UNLOCK_FD_RD(nvf, cpuid) {(void)(cpuid);} +#define NVP_LOCK_FD_WR(nvf) {(void)(nvf->lock);} +#define NVP_UNLOCK_FD_WR(nvf) {(void)(nvf->lock);} + +#endif + +extern struct NVFile* _nvp_fd_lookup; + +// Index by fd no. 
If true intercept, else +bool* _fd_intercept_lookup; + +void nvp_cleanup_node(struct NVNode *node, int free_root, int unmap_btree); + +RETT_SYSCALL_INTERCEPT _sfs_REAL_CLOSE(int file, ino_t serialno, int async_file_closing, long* result); + +#endif diff --git a/splitfs_syscall_intercept/src/fsync.c b/splitfs_syscall_intercept/src/fsync.c new file mode 100644 index 0000000000..0c1a6a770d --- /dev/null +++ b/splitfs_syscall_intercept/src/fsync.c @@ -0,0 +1,90 @@ +/* + * ===================================================================================== + * + * Filename: fsync.c + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:22:19 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ + +// required for sched_getcpu (GET_CPUID) +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif +#include + +#include "fsync.h" +#include "nvp_lock.h" +#include "inode.h" +#include "relink.h" +#include "staging.h" +#include "tbl_mmaps.h" +#include "timers.h" +#include "handle_mmaps.h" + +static inline void copy_appends_to_file(struct NVFile* nvf, int close, int fdsync) +{ + if (close && nvf->node->reference > 1) + return; + + swap_extents(nvf, close); + nvp_transfer_to_free_dr_pool(nvf->node); +} + +/* FIXME: untested */ +void fsync_flush_on_fsync(struct NVFile* nvf, int cpuid, int close, int fdsync) +{ + struct NVTable_maps *tbl_app = &_nvp_tbl_mmaps[nvf->node->serialno % APPEND_TBL_MAX]; + +#if DATA_JOURNALING_ENABLED + struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[nvf->node->serialno % OVER_TBL_MAX]; +#else + struct NVTable_maps *tbl_over = NULL; +#endif // DATA_JOURNALING_ENABLED + + NVP_LOCK_NODE_WR(nvf); + TBL_ENTRY_LOCK_WR(tbl_app); + TBL_ENTRY_LOCK_WR(tbl_over); + + copy_appends_to_file(nvf, close, fdsync); + + TBL_ENTRY_UNLOCK_WR(tbl_over); + TBL_ENTRY_UNLOCK_WR(tbl_app); + NVP_UNLOCK_NODE_WR(nvf); +} + +RETT_SYSCALL_INTERCEPT _sfs_FSYNC(INTF_SYSCALL) +{ + instrumentation_type fsync_time; + int cpuid = -1; + int file; + + file = (int)arg0; + + if(!_fd_intercept_lookup[file]) { + return RETT_PASS_KERN; + } + + START_TIMING(fsync_t, fsync_time); + GLOBAL_LOCK_WR(); + + // Retrieve the NVFile from the global array of NVFiles + cpuid = GET_CPUID(); + struct NVFile* nvf = &_nvp_fd_lookup[file]; + // This goes to fsync_flush_on_fsync() + fsync_flush_on_fsync(nvf, cpuid, 0, 0); + num_fsync++; + END_TIMING(fsync_t, fsync_time); + GLOBAL_UNLOCK_WR(); + *result = 0; + return RETT_NO_PASS_KERN; +} \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/fsync.h b/splitfs_syscall_intercept/src/fsync.h new file mode 100644 index 0000000000..7215204c39 --- /dev/null +++ b/splitfs_syscall_intercept/src/fsync.h @@ -0,0 +1,37 @@ +/* + * ===================================================================================== + * + * Filename: fsync.h + * + * Description: + * + * Version: 1.0 + * Created: 09/28/2019 11:42:45 AM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ + +#ifndef SPLITFS_FSYNC_H +#define SPLITFS_FSYNC_H + +#include +#include "file.h" + +#define FSYNC_POLICY_NONE 0 +#define FSYNC_POLICY_FLUSH_ON_FSYNC 1 +#define FSYNC_POLICY_UNCACHEABLE_MAP 2 +#define FSYNC_POLICY_NONTEMPORAL_WRITES 3 +#define FSYNC_POLICY_FLUSH_ON_WRITE 4 + +#define FSYNC_MEMCPY MEMCPY +#define FSYNC_MMAP MMAP +#define 
FSYNC_FSYNC(nvf,cpuid,close,fdsync) fsync_flush_on_fsync(nvf,cpuid,close,fdsync)
+
+void fsync_flush_on_fsync(struct NVFile* nvf, int cpuid, int close, int fdsync);
+
+#endif
diff --git a/splitfs_syscall_intercept/src/handle_mmaps.c b/splitfs_syscall_intercept/src/handle_mmaps.c
new file mode 100644
index 0000000000..fdc20858f4
--- /dev/null
+++ b/splitfs_syscall_intercept/src/handle_mmaps.c
@@ -0,0 +1,1136 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  handle_mmaps.c
+ *
+ *    Description:
+ *
+ *        Version:  1.0
+ *        Created:  09/25/2019 03:39:26 PM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  YOUR NAME (),
+ *   Organization:
+ *
+ * =====================================================================================
+ */
+#include
+#include
+// #include "liblfds711/inc/liblfds711.h"
+#include "handle_mmaps.h"
+#include "bg_clear_mmap.h"
+#include "timers.h"
+#include "nvp_lock.h"
+#include "fsync.h"
+
+void *intel_memcpy(void * __restrict__ b, const void * __restrict__ a, size_t n){
+	char *s1 = b;
+	const char *s2 = a;
+	for(; 0 < n; --n)
+		*s1++ = *s2++;
+	return b;
+}
+
+void create_dr_mmap(struct NVNode *node, int is_overwrite)
+{
+	int dr_fd = -1, ret = -1;
+	char dr_fname[256];
+	struct stat stat_buf;
+
+	if (is_overwrite) {
+		_nvp_full_drs[full_dr_idx].dr_fd = node->dr_over_info.dr_fd;
+		_nvp_full_drs[full_dr_idx].start_addr = node->dr_over_info.start_addr;
+		_nvp_full_drs[full_dr_idx].size = DR_OVER_SIZE;
+		full_dr_idx++;
+	} else {
+		_nvp_full_drs[full_dr_idx].dr_fd = node->dr_info.dr_fd;
+		_nvp_full_drs[full_dr_idx].start_addr = node->dr_info.start_addr;
+		_nvp_full_drs[full_dr_idx].size = DR_SIZE;
+		full_dr_idx++;
+	}
+
+	if (is_overwrite)
+		sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-OVER-XXXXXX");
+	else
+		sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-XXXXXX");
+	dr_fd = syscall_no_intercept(SYS_open, mktemp(dr_fname), O_RDWR | O_CREAT, 0666);
+	if (dr_fd < 0) {
+		MSG("%s: open of DR file failed. Err = %s\n",
+		    __func__, strerror(-dr_fd));
+		assert(0);
+	}
+	if (is_overwrite)
+		ret = posix_fallocate(dr_fd, 0, DR_OVER_SIZE);
+	else
+		ret = posix_fallocate(dr_fd, 0, DR_SIZE);
+
+	if (ret < 0) {
+		MSG("%s: posix_fallocate failed. Err = %s\n",
+		    __func__, strerror(errno));
+		assert(0);
+	}
+
+	syscall_no_intercept(SYS_fstat, dr_fd, &stat_buf);
+
+	if (is_overwrite) {
+		node->dr_over_info.dr_fd = dr_fd;
+		node->dr_over_info.start_addr = (unsigned long) FSYNC_MMAP
+		(
+			NULL,
+			DR_OVER_SIZE,
+			PROT_READ | PROT_WRITE, //max_perms,
+			MAP_SHARED | MAP_POPULATE,
+			node->dr_over_info.dr_fd, //fd_with_max_perms,
+			0
+		);
+
+		if (node->dr_over_info.start_addr == 0) {
+			MSG("%s: mmap failed. Err = %s\n", __func__, strerror(errno));
+			assert(0);
+		}
+		node->dr_over_info.valid_offset = 0;
+		node->dr_over_info.dr_offset_start = 0;
+		node->dr_over_info.dr_offset_end = DR_OVER_SIZE;
+		node->dr_over_info.dr_serialno = stat_buf.st_ino;
+
+	} else {
+		node->dr_info.dr_fd = dr_fd;
+		node->dr_info.start_addr = (unsigned long) FSYNC_MMAP
+		(
+			NULL,
+			DR_SIZE,
+			PROT_READ | PROT_WRITE, //max_perms,
+			MAP_SHARED | MAP_POPULATE,
+			node->dr_info.dr_fd, //fd_with_max_perms,
+			0
+		);
+
+		if (node->dr_info.start_addr == 0) {
+			MSG("%s: mmap failed. 
Err = %s\n", __func__, strerror(errno)); + assert(0); + } + node->dr_info.valid_offset = 0; + node->dr_info.dr_offset_start = DR_SIZE; + node->dr_info.dr_offset_end = node->dr_info.valid_offset; + node->dr_info.dr_serialno = stat_buf.st_ino; + } + + DEBUG_FILE("%s: Unmapped and mapped DR file again\n", __func__); +} + +void change_dr_mmap(struct NVNode *node, int is_overwrite) { + struct free_dr_pool *temp_dr_mmap = NULL; + unsigned long offset_in_page = 0; + + DEBUG_FILE("%s: Throwing away DR File FD = %d\n", __func__, node->dr_info.dr_fd); + + if (is_overwrite) { + struct free_dr_pool *temp_dr_info = lfq_dequeue(&staging_over_mmap_queue_ctx); + if (temp_dr_info != 0) { + //if( lfds711_queue_umm_dequeue(&qs_over, &qe_over) ) { + // Found addr in global pool + //struct free_dr_pool *temp_dr_info = NULL; + //temp_dr_info = LFDS711_QUEUE_UMM_GET_VALUE_FROM_ELEMENT( *qe_over ); + node->dr_over_info.start_addr = temp_dr_info->start_addr; + node->dr_over_info.valid_offset = temp_dr_info->valid_offset; + node->dr_over_info.dr_offset_start = temp_dr_info->dr_offset_start; + node->dr_over_info.dr_fd = temp_dr_info->dr_fd; + node->dr_over_info.dr_serialno = temp_dr_info->dr_serialno; + node->dr_over_info.dr_offset_end = DR_OVER_SIZE; + DEBUG_FILE("%s: DR found in global pool. Got from global pool. FD = %d\n", + __func__, temp_dr_info->dr_fd); + } else { + DEBUG_FILE("%s: Global queue empty\n", __func__); + memset((void *)&node->dr_info, 0, sizeof(struct free_dr_pool)); + } + } else { + struct free_dr_pool *temp_dr_info = lfq_dequeue(&staging_mmap_queue_ctx); + if (temp_dr_info != 0) { + //if( lfds711_queue_umm_dequeue(&qs, &qe) ) { + // Found addr in global pool + //struct free_dr_pool *temp_dr_info = NULL; + //temp_dr_info = LFDS711_QUEUE_UMM_GET_VALUE_FROM_ELEMENT( *qe ); + node->dr_info.start_addr = temp_dr_info->start_addr; + node->dr_info.valid_offset = temp_dr_info->valid_offset; + node->dr_info.dr_offset_start = DR_SIZE; + node->dr_info.dr_fd = temp_dr_info->dr_fd; + node->dr_info.dr_serialno = temp_dr_info->dr_serialno; + node->dr_info.dr_offset_end = temp_dr_info->valid_offset; + DEBUG_FILE("%s: DR found in global pool. Got from global pool. FD = %d\n", + __func__, temp_dr_info->dr_fd); + } else { + DEBUG_FILE("%s: Global queue empty\n", __func__); + memset((void *)&node->dr_info, 0, sizeof(struct free_dr_pool)); + } + } + + __atomic_fetch_sub(&num_drs_left, 1, __ATOMIC_SEQ_CST); + + callBgCleaningThread(is_overwrite); +} + +void nvp_free_dr_mmaps() +{ + unsigned long addr; + unsigned long offset_in_page = 0; + struct free_dr_pool *temp_free_pool_of_dr_mmaps; + int i = 0; + ssize_t file_name_size = 0; + + while ((temp_free_pool_of_dr_mmaps = lfq_dequeue(&staging_mmap_queue_ctx)) != 0) { + //while( lfds711_queue_umm_dequeue(&qs, &qe) ) { + //temp_free_pool_of_dr_mmaps = LFDS711_QUEUE_UMM_GET_VALUE_FROM_ELEMENT( *qe ); + addr = temp_free_pool_of_dr_mmaps->start_addr; + munmap((void *)addr, DR_SIZE); + + // Fetch the name of the file before closing it. + char fd_str[256]; + char new_path[256]; + sprintf(fd_str, "/proc/self/fd/%d", temp_free_pool_of_dr_mmaps->dr_fd); + file_name_size = readlink(fd_str, new_path, sizeof(new_path)); + if (file_name_size == -1) + assert(0); + new_path[file_name_size] = '\0'; + + close(temp_free_pool_of_dr_mmaps->dr_fd); + + // Remove the file. 
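+		// (new_path was recovered above by readlink() on /proc/self/fd,
+		// since the staging file was created with mktemp() and its name
+		// was not stored anywhere else.)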
+		syscall_no_intercept(SYS_unlink, new_path);
+		__atomic_fetch_sub(&num_drs_left, 1, __ATOMIC_SEQ_CST);
+	}
+	// lfds711_queue_umm_cleanup( &qs_over, NULL );
+
+	for (i = 0; i < full_dr_idx; i++) {
+		addr = _nvp_full_drs[i].start_addr;
+		munmap((void *)addr, _nvp_full_drs[i].size);
+		syscall_no_intercept(SYS_close, _nvp_full_drs[i].dr_fd);
+	}
+
+#endif // DATA_JOURNALING_ENABLED
+
+}
+
+void nvp_reset_mappings(struct NVNode *node)
+{
+	int i, dirty_index;
+
+	DEBUG("Cleanup: root %p, height %u\n", (void *)node->root, node->height);
+
+	if(node->root_dirty_num) {
+		// Check how many mmap()s need to be cleared. If total_dirty_mmaps is set, all the mmap()s need to be cleared.
+		if(node->total_dirty_mmaps) {
+			memset((void *)node->root, 0, 1024 * sizeof(unsigned long));
+			memset((void *)node->merkle_root, 0, 1024 * sizeof(unsigned long));
+		} else {
+			// Only clear the dirty mmaps. The indexes can be found in the root_dirty_cache. 
+ for(i = 0; i < node->root_dirty_num; i++) { + dirty_index = node->root_dirty_cache[i]; + if(node->root && node->root[dirty_index]) { + node->root[dirty_index] = 0; + node->merkle_root[dirty_index] = 0; + } + } + } + if(node->root_dirty_num) + memset((void *)node->root_dirty_cache, 0, 20 * sizeof(unsigned long)); + } + node->isRootSet = 0; + node->height = 0; + node->total_dirty_mmaps = 0; + node->root_dirty_num = 0; +} + +static unsigned int calculate_new_height(off_t offset) +{ + unsigned int height = 0; + off_t temp_offset = offset / ((unsigned long)1024 * MAX_MMAP_SIZE); + + while (temp_offset) { + temp_offset /= 1024; + height++; + } + + return height; +} + +int nvp_get_mmap_address(struct NVFile *nvf, + off_t offset, + size_t count, + unsigned long *mmap_addr, + unsigned long *bitmap_root, + off_t *offset_within_mmap, + size_t *extent_length, + int wr_lock, + int cpuid, + struct NVTable_maps *tbl_app, + struct NVTable_maps *tbl_over) +{ + int i; + int index; + unsigned int height = nvf->node->height; + unsigned int new_height; + unsigned long capacity = MAX_MMAP_SIZE; + unsigned long *root = nvf->node->root; + +#if !NON_TEMPORAL_WRITES + unsigned long *merkle_root = nvf->node->merkle_root; + unsigned long merkle_start_addr; +#endif + + unsigned long start_addr; + off_t start_offset = offset; + instrumentation_type nvnode_lock_time, file_mmap_time; + + DEBUG("Get mmap address: offset 0x%lx, height %u\n", offset, height); + DEBUG("root @ %p\n", root); + + do { + capacity = calculate_capacity(height); + index = start_offset / capacity; + + DEBUG("index %d\n", index); +#if !NON_TEMPORAL_WRITES + if (index >= 1024 || root[index] == 0 || merkle_root[index] == 0) { +#else + if (index >= 1024 || root[index] == 0) { +#endif + goto not_found; + } + if (height) { + root = (unsigned long *)root[index]; + +#if !NON_TEMPORAL_WRITES + merkle_root = (unsigned long *)merkle_root[index]; +#endif + + DEBUG("%p\n", root); + } else { + start_addr = root[index]; + +#if !NON_TEMPORAL_WRITES + merkle_start_addr = merkle_root[index]; +#endif + DEBUG("addr 0x%lx\n", start_addr); + } + start_offset = start_offset % capacity; + } while(height--); + //NVP_END_TIMING(lookup_t, lookup_time); + +#if !NON_TEMPORAL_WRITES + if (IS_ERR(start_addr) || start_addr == 0 || merkle_start_addr == 0) { +#else + if (IS_ERR(start_addr) || start_addr == 0) { +#endif + MSG("ERROR!\n"); + fflush(NULL); + assert(0); + } + + (*mmap_addr) = (start_addr + (offset % MAX_MMAP_SIZE)); + *offset_within_mmap = offset % MAX_MMAP_SIZE; + +#if !NON_TEMPORAL_WRITES + *bitmap_root = merkle_start_addr; +#endif + (*extent_length) = (MAX_MMAP_SIZE - (offset % MAX_MMAP_SIZE)); + + DEBUG("Found: mmap addr 0x%lx, extent length %lu\n", + *mmap_addr, *extent_length); + return 0; + +not_found: + DEBUG("Not found, perform mmap\n"); + + if (offset >= ALIGN_MMAP_DOWN(nvf->node->true_length)) { + DEBUG("File length smaller than offset: " + "length 0x%lx, offset 0x%lx\n", + nvf->node->length, offset); + return 1; + } + + if (!wr_lock) { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + + NVP_UNLOCK_NODE_RD(nvf, cpuid); + START_TIMING(nvnode_lock_t, nvnode_lock_time); + NVP_LOCK_NODE_WR(nvf); + + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + + END_TIMING(nvnode_lock_t, nvnode_lock_time); + } + + start_offset = ALIGN_MMAP_DOWN(offset); + + if (start_offset + MAX_MMAP_SIZE > nvf->node->true_length) { + ERROR("File length smaller than offset: " + "length 0x%lx, offset 0x%lx\n", + nvf->node->length, 
offset); + MSG("%s: file length smaller than offset\n", __func__); + return 1; + } + + START_TIMING(file_mmap_t, file_mmap_time); + int max_perms = ((nvf->canRead) ? PROT_READ : 0) | + ((nvf->canWrite) ? PROT_WRITE : 0); + + start_addr = (unsigned long) FSYNC_MMAP + ( + NULL, + MAX_MMAP_SIZE, + max_perms, //max_perms, + MAP_SHARED | MAP_POPULATE, +// MAP_SHARED, + nvf->fd, //fd_with_max_perms, + start_offset + //0 + ); + + END_TIMING(file_mmap_t, file_mmap_time); + + DEBUG("%s: created mapping of address = %lu, inode = %lu, thread id = %lu\n", __func__, start_addr, nvf->node->serialno, pthread_self()); + + /* Bitmap Tree creation */ +#if !NON_TEMPORAL_WRITES + createTree((struct merkleBtreeNode **)&merkle_start_addr); + if (IS_ERR(start_addr) || start_addr == 0 || merkle_start_addr == 0) { +#else + if (IS_ERR(start_addr) || start_addr == 0) { +#endif + MSG("mmap failed for fd %i: %s, mmap count %d, addr %lu, errno is %lu\n", + nvf->fd, strerror(errno), num_mmap, start_addr, errno); + MSG("Open count %d, close count %d\n", num_open, num_close); + MSG("Use posix operations for fd %i instead.\n", nvf->fd); + nvf->posix = 1; + fflush(NULL); + assert(0); + } + + DEBUG_FILE("%s: Performed mmap. Start_addr = %p, inode no = %lu\n", __func__, (void *) start_addr, nvf->node->serialno); + + num_mmap++; + + DEBUG("mmap offset 0x%lx, start_offset 0x%lx\n", offset, start_offset); + + height = nvf->node->height; + new_height = calculate_new_height(offset); + + if (height < new_height) { + MSG("Increase height from %u to %u\n", height, new_height); + + while (height < new_height) { + unsigned long old_root = (unsigned long)nvf->node->root; + nvf->node->root = malloc(1024 * sizeof(unsigned long)); + +#if !NON_TEMPORAL_WRITES + unsigned long old_merkle_root = (unsigned long)nvf->node->merkle_root; + nvf->node->merkle_root = malloc(1024 * sizeof(unsigned long)); + for (i = 0; i < 1024; i++) { + nvf->node->root[i] = 0; + nvf->node->merkle_root[i] = 0; + } + nvf->node->merkle_root[0] = (unsigned long)old_merkle_root; +#else + for (i = 0; i < 1024; i++) { + nvf->node->root[i] = 0; + } +#endif + DEBUG("Malloc new root @ %p\n", nvf->node->root); + nvf->node->root[0] = (unsigned long)old_root; + DEBUG("Old root 0x%lx\n", nvf->node->root[0]); + height++; + } + + nvf->node->height = new_height; + height = new_height; + } + + root = nvf->node->root; +#if !NON_TEMPORAL_WRITES + merkle_root = nvf->node->merkle_root; +#endif + do { + capacity = calculate_capacity(height); + index = start_offset / capacity; + DEBUG("index %d\n", index); + if (height) { + if (root[index] == 0) { + root[index] = (unsigned long)malloc(1024 * + sizeof(unsigned long)); + +#if !NON_TEMPORAL_WRITES + merkle_root[index] = (unsigned long)malloc(1024 * sizeof(unsigned long)); + root = (unsigned long *)root[index]; + merkle_root = (unsigned long *)merkle_root[index]; + for (i = 0; i < 1024; i++) { + root[i] = 0; + merkle_root[i] = 0; + } +#else + root = (unsigned long *)root[index]; + for (i = 0; i < 1024; i++) { + root[i] = 0; + } +#endif + } else { + root = (unsigned long *)root[index]; +#if !NON_TEMPORAL_WRITES + merkle_root = (unsigned long *)merkle_root[index]; +#endif + } + } else { + root[index] = start_addr; + nvf->node->root_dirty_cache[nvf->node->root_dirty_num] = index; + if(!nvf->node->total_dirty_mmaps) { + nvf->node->root_dirty_num++; + if(nvf->node->root_dirty_num == 20) + nvf->node->total_dirty_mmaps = 1; + } +#if !NON_TEMPORAL_WRITES + merkle_root[index] = merkle_start_addr; +#endif + } + start_offset = start_offset % capacity; + } 
while(height--); + + nvf->node->isRootSet = 1; + (*mmap_addr) = (start_addr + (offset % MAX_MMAP_SIZE)); + *offset_within_mmap = offset % MAX_MMAP_SIZE; + +#if !NON_TEMPORAL_WRITES + *bitmap_root = merkle_start_addr; +#endif + (*extent_length) = (MAX_MMAP_SIZE - (offset % MAX_MMAP_SIZE)); + + if (!wr_lock) { + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + + NVP_UNLOCK_NODE_WR(nvf); + NVP_LOCK_NODE_RD(nvf, cpuid); + + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + } + + DEBUG("mmap addr 0x%lx, extent length %lu\n", + *mmap_addr, *extent_length); + + return 0; +} + +#if DATA_JOURNALING_ENABLED + +static void nvp_manage_over_dr_memory(struct NVFile *nvf, uint64_t *extent_length, + uint64_t len_to_write, off_t start_offset, + int index) +{ + int i; + /* + * Check if the reads are being served from DR. If yes, then all the future reads should + * be performed through the file backed memory, for the appended and fsync()ed region. + */ + + DEBUG_FILE("%s START: dr_offset_start = %lu, dr_offset_end = %lu\n", + __func__, nvf->node->dr_over_info.dr_offset_start, nvf->node->dr_over_info.dr_offset_end); + if(*extent_length >= len_to_write) + nvf->node->dr_over_info.dr_offset_start = start_offset + len_to_write; +} + +#endif // DATA_JOURNALING_ENABLED + +static void nvp_manage_dr_memory(struct NVFile *nvf, uint64_t *extent_length, + uint64_t len_to_write, off_t start_offset, + int index) +{ + int i; + unsigned long offset_within_mmap = 0; + /* + * Check if the reads are being served from DR. If yes, then all the future reads should + * be performed through the file backed memory, for the appended and fsync()ed region. + */ + + offset_within_mmap = start_offset; + + DEBUG_FILE("%s START: dr_offset_start = %lu, dr_offset_end = %lu, offset_within_mmap = %lu\n", + __func__, nvf->node->dr_info.dr_offset_start, nvf->node->dr_info.dr_offset_end, + offset_within_mmap); + + if(nvf->node->dr_info.dr_offset_start > offset_within_mmap) + // Update the portion from which the dirty DR region starts. + nvf->node->dr_info.dr_offset_start = offset_within_mmap; + if(*extent_length > len_to_write) { + if(nvf->node->dr_info.dr_offset_end < (offset_within_mmap + len_to_write)) + // Update the portion till which the dirty DR region exists + nvf->node->dr_info.dr_offset_end = offset_within_mmap + len_to_write; + } else { + // It is a large write. So finish writing to this mmap. 
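+		// (Pushing dr_offset_end all the way to DR_SIZE marks the
+		// remainder of this staging mmap dirty in one step.)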
+ if(nvf->node->dr_info.dr_offset_end < (offset_within_mmap + *extent_length)) + nvf->node->dr_info.dr_offset_end = DR_SIZE; + } + + DEBUG_FILE("%s END: dr_offset_start = %lu, dr_offset_end = %lu, offset_within_mmap = %lu\n", + __func__, nvf->node->dr_info.dr_offset_start, nvf->node->dr_info.dr_offset_end, + offset_within_mmap); + + if (nvf->node->dr_info.dr_offset_start < nvf->node->dr_info.valid_offset) + assert(0); + if (nvf->node->dr_info.valid_offset > DR_SIZE) + assert(0); + if (nvf->node->dr_info.dr_offset_start > DR_SIZE) + assert(0); + if (nvf->node->dr_info.dr_offset_end > DR_SIZE) + assert(0); + if (nvf->node->dr_info.dr_offset_end < nvf->node->dr_info.dr_offset_start) + assert(0); +} + +#if DATA_JOURNALING_ENABLED + +int nvp_get_over_dr_address(struct NVFile *nvf, + off_t offset, + size_t len_to_write, + unsigned long *mmap_addr, + off_t *offset_within_mmap, + size_t *extent_length, + int wr_lock, + int cpuid, + struct NVTable_maps *tbl_app, + struct NVTable_maps *tbl_over) +{ + int index; + unsigned long capacity = DR_OVER_SIZE; + unsigned long start_addr, unaligned_file_end; + off_t file_offset = offset, offset_within_page = 0; + off_t start_offset = 0; + struct stat stat_buf; + instrumentation_type nvnode_lock_time, dr_mem_queue_time; + + DEBUG("Get mmap address: offset 0x%lx, height %u\n", + offset, height); + /* The index of the mmap in the global DR pool. + * Max number of entries = 1024. + */ + if (nvf->node->dr_over_info.start_addr == 0) + goto not_found; + + /* Anonymous mmap at that index is present for the file. + * So get the start address. + */ + start_addr = nvf->node->dr_over_info.start_addr; + DEBUG("addr 0x%lx\n", start_addr); + // Get the offset in the mmap to which the memcpy must be performed. + if (IS_ERR(start_addr) || start_addr == 0) { + MSG("%s: ERROR!\n", __func__); + assert(0); + } + /* address we want to perform memcpy(). The start_offset + * is the offset with relation to node->true_length. + */ + start_offset = nvf->node->dr_over_info.dr_offset_start; + + DEBUG_FILE("%s: DR valid_offset = %lu. Start offset = %lu, true length = %lu\n", + __func__, nvf->node->dr_over_info.valid_offset, + start_offset, nvf->node->true_length); + + if ((start_offset % MMAP_PAGE_SIZE) != (file_offset % MMAP_PAGE_SIZE)) { + offset_within_page = start_offset % MMAP_PAGE_SIZE; + if (offset_within_page != 0) { + start_offset += MMAP_PAGE_SIZE - offset_within_page; + } + offset_within_page = file_offset % MMAP_PAGE_SIZE; + if (offset_within_page != 0) { + start_offset += offset_within_page; + } + } + + if (start_offset >= DR_OVER_SIZE) { + DEBUG_FILE("%s: start_offset = %lld, DR_OVER_SIZE = %lu, dr_offset_start = %lld\n", + __func__, start_offset, DR_OVER_SIZE, nvf->node->dr_over_info.dr_offset_start); + } + + if (nvf->node->dr_over_info.valid_offset > start_offset) + assert(0); + + *mmap_addr = start_addr + start_offset; + *offset_within_mmap = start_offset; + /* This gives how much free space is remaining in the + * current anonymous mmap. + */ + if (start_offset < DR_OVER_SIZE) + *extent_length = DR_OVER_SIZE - start_offset; + else + *extent_length = 0; + /* The mmap for that index was not found. Performing mmap + * in this section. 
+ */ + if (!wr_lock) { + TBL_ENTRY_UNLOCK_WR(tbl_over); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + NVP_UNLOCK_NODE_RD(nvf, cpuid); + + START_TIMING(nvnode_lock_t, nvnode_lock_time); + + NVP_LOCK_NODE_WR(nvf); + TBL_ENTRY_LOCK_WR(tbl_app); + TBL_ENTRY_LOCK_WR(tbl_over); + + END_TIMING(nvnode_lock_t, nvnode_lock_time); + } + + nvp_manage_over_dr_memory(nvf, extent_length, len_to_write, + start_offset, index); + + if (nvf->node->dr_over_info.dr_offset_end != DR_OVER_SIZE) + assert(0); + + return 0; + +not_found: + /* The mmap for that index was not found. Performing mmap + * in this section. + */ + if (!wr_lock) { + + TBL_ENTRY_UNLOCK_WR(tbl_over); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + NVP_UNLOCK_NODE_RD(nvf, cpuid); + + START_TIMING(nvnode_lock_t, nvnode_lock_time); + + NVP_LOCK_NODE_WR(nvf); + TBL_ENTRY_LOCK_WR(tbl_app); + TBL_ENTRY_LOCK_WR(tbl_over); + + END_TIMING(nvnode_lock_t, nvnode_lock_time); + } + + START_TIMING(dr_mem_queue_t, dr_mem_queue_time); + + struct free_dr_pool *temp_dr_info = lfq_dequeue(&staging_over_mmap_queue_ctx); + if (temp_dr_info != 0) { + //if( lfds711_queue_umm_dequeue(&qs_over, &qe_over) ) { + // Found addr in global pool + //struct free_dr_pool *temp_dr_info = NULL; + unsigned long offset_in_page = 0; + //temp_dr_info = LFDS711_QUEUE_UMM_GET_VALUE_FROM_ELEMENT( *qe_over ); + nvf->node->dr_over_info.start_addr = temp_dr_info->start_addr; + nvf->node->dr_over_info.valid_offset = temp_dr_info->valid_offset; + nvf->node->dr_over_info.dr_fd = temp_dr_info->dr_fd; + nvf->node->dr_over_info.dr_serialno = temp_dr_info->dr_serialno; + nvf->node->dr_over_info.dr_offset_start = temp_dr_info->dr_offset_start; + nvf->node->dr_over_info.dr_offset_end = DR_OVER_SIZE; + __atomic_fetch_sub(&num_drs_left, 1, __ATOMIC_SEQ_CST); + } else { + DEBUG_FILE("%s: Allocating new DR\n", __func__); + // Nothing in global pool + int dr_fd = 0; + int i = 0; + char dr_fname[256]; + unsigned long offset_in_page = 0; + int num_blocks = DR_OVER_SIZE / MMAP_PAGE_SIZE; + int max_perms = ((nvf->canRead) ? PROT_READ : 0) | + ((nvf->canWrite) ? PROT_WRITE : 0); + DEBUG_FILE("%s: DR not found in global pool. Allocated dr_file variable\n", __func__); + + sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-OVER-XXXXXX"); + dr_fd = syscall_no_intercept(SYS_open, mktemp(dr_fname), O_RDWR | O_CREAT, 0666); + if (dr_fd < 0) { + MSG("%s: mkstemp of DR file failed. Err = %s\n", + __func__, strerror(-dr_fd)); + assert(0); + } + posix_fallocate(dr_fd, 0, DR_SIZE); + num_mmap++; + num_drs++; + num_drs_critical_path++; + nvf->node->dr_over_info.start_addr = (unsigned long) FSYNC_MMAP + ( + NULL, + DR_OVER_SIZE, + max_perms, //max_perms, + MAP_SHARED | MAP_POPULATE, + dr_fd, //fd_with_max_perms, + 0 + ); + + DEBUG_FILE("%s: Setting offset_start to DR_SIZE. FD = %d\n", + __func__, nvf->fd); + syscall_no_intercept(SYS_fstat, dr_fd, &stat_buf); + nvf->node->dr_over_info.dr_serialno = stat_buf.st_ino; + nvf->node->dr_over_info.dr_fd = dr_fd; + nvf->node->dr_over_info.valid_offset = 0; + nvf->node->dr_over_info.dr_offset_start = 0; + nvf->node->dr_over_info.dr_offset_end = DR_OVER_SIZE; + dr_fname[0] = '\0'; + DEBUG_FILE("%s: DR not found in global pool. Initialized DR_INFO. 
FD = %d\n", __func__, dr_fd); + } + start_addr = nvf->node->dr_over_info.start_addr; + __atomic_fetch_add(&dr_mem_allocated, DR_OVER_SIZE, + __ATOMIC_SEQ_CST); + nvf->node->dr_mem_used += DR_OVER_SIZE; + + END_TIMING(dr_mem_queue_t, dr_mem_queue_time); + if (IS_ERR(start_addr) || start_addr == 0) + { + MSG("mmap failed for %s, mmap count %d, addr %lu, errno is %lu\n", + strerror(errno), num_mmap, start_addr, errno); + MSG("Open count %d, close count %d\n", + num_open, num_close); + nvf->posix = 1; + assert(0); + } + /* Get the index of the mmap from the size of mmap and + * from the offset. + */ + DEBUG_FILE("%s: offset requested = %lu\n", __func__, offset); + start_offset = nvf->node->dr_over_info.dr_offset_start; + offset_within_page = start_offset % MMAP_PAGE_SIZE; + if (offset_within_page != 0) { + start_offset += MMAP_PAGE_SIZE - offset_within_page; + } + offset_within_page = file_offset % MMAP_PAGE_SIZE; + if (offset_within_page != 0) { + start_offset += offset_within_page; + } + + if ((start_offset % MMAP_PAGE_SIZE) != (file_offset % MMAP_PAGE_SIZE)) + assert(0); + + if (start_offset >= DR_OVER_SIZE) { + DEBUG_FILE("%s: start_offset = %lld, DR_OVER_SIZE = %lu, dr_offset_start = %lld\n", + __func__, start_offset, DR_OVER_SIZE, nvf->node->dr_over_info.dr_offset_start); + } + + if (nvf->node->dr_over_info.valid_offset > start_offset) + assert(0); + + *mmap_addr = start_addr + start_offset; + *offset_within_mmap = start_offset; + + if (start_offset < DR_OVER_SIZE) + *extent_length = DR_OVER_SIZE - start_offset; + else + *extent_length = 0; + + DEBUG_FILE("%s: Will do manage DR memory if it is a write\n", + __func__); + + nvp_manage_over_dr_memory(nvf, extent_length, + len_to_write, start_offset, index); + + if (nvf->node->dr_over_info.dr_offset_end != DR_OVER_SIZE) + assert(0); + + return 0; +} + +#endif // DATA_JOURNALING_ENABLED + +int nvp_get_dr_mmap_address(struct NVFile *nvf, off_t offset, + size_t len_to_write, size_t count, + unsigned long *mmap_addr, + off_t *offset_within_mmap, + size_t *extent_length, int wr_lock, + int cpuid, int iswrite, + struct NVTable_maps *tbl_app, + struct NVTable_maps *tbl_over) +{ + int index; + unsigned long capacity = DR_SIZE; + unsigned long start_addr, unaligned_file_end; + off_t start_offset = offset; + struct stat stat_buf; + instrumentation_type nvnode_lock_time, dr_mem_queue_time; + + DEBUG("Get mmap address: offset 0x%lx, height %u\n", + offset, height); + /* The index of the mmap in the global DR pool. + * Max number of entries = 1024. + */ + if (nvf->node->dr_info.start_addr == 0) { + if(iswrite) + /* Have to get the mmap from the + * global anonymous pool. + */ + goto not_found; + else { + /* If it is a read, then the anonymous mmap + * must be found. Otherwise something is wrong. + */ + ERROR("dr mmap not found\n"); + MSG("%s: dr mmap not found\n", __func__); + assert(0); + } + } + /* Anonymous mmap at that index is present for the file. + * So get the start address. + */ + start_addr = nvf->node->dr_info.start_addr; + DEBUG("addr 0x%lx\n", start_addr); + // Get the offset in the mmap to which the memcpy must be performed. + if (IS_ERR(start_addr) || start_addr == 0) { + MSG("%s: ERROR!\n", __func__); + assert(0); + } + /* address we want to perform memcpy(). The start_offset + * is the offset with relation to node->true_length. + */ + DEBUG_FILE("%s: DR valid_offset = %lu. 
Start offset = %lu, true length = %lu\n", + __func__, nvf->node->dr_info.valid_offset, + start_offset, nvf->node->true_length); + start_offset = (start_offset + + nvf->node->dr_info.valid_offset); + *mmap_addr = start_addr + start_offset; + *offset_within_mmap = start_offset; + /* This gives how much free space is remaining in the + * current anonymous mmap. + */ + *extent_length = DR_SIZE - start_offset; + /* The mmap for that index was not found. Performing mmap + * in this section. + */ + + if (!wr_lock) { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + NVP_UNLOCK_NODE_RD(nvf, cpuid); + + START_TIMING(nvnode_lock_t, nvnode_lock_time); + NVP_LOCK_NODE_WR(nvf); + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + END_TIMING(nvnode_lock_t, nvnode_lock_time); + } + if(iswrite) { + nvp_manage_dr_memory(nvf, extent_length, len_to_write, + start_offset, index); + } + + if (!wr_lock && !iswrite) { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + NVP_UNLOCK_NODE_WR(nvf); + + NVP_LOCK_NODE_RD(nvf, cpuid); + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + } + + return 0; + +not_found: + /* The mmap for that index was not found. Performing mmap + * in this section. + */ + if (!wr_lock) { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + NVP_UNLOCK_NODE_RD(nvf, cpuid); + + START_TIMING(nvnode_lock_t, nvnode_lock_time); + NVP_LOCK_NODE_WR(nvf); + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + END_TIMING(nvnode_lock_t, nvnode_lock_time); + } + + START_TIMING(dr_mem_queue_t, dr_mem_queue_time); + + struct free_dr_pool *temp_dr_info = lfq_dequeue(&staging_mmap_queue_ctx); + if (temp_dr_info != 0) { + //if( lfds711_queue_umm_dequeue(&qs, &qe) ) { + // Found addr in global pool + //struct free_dr_pool *temp_dr_info = NULL; + unsigned long offset_in_page = 0; + //temp_dr_info = LFDS711_QUEUE_UMM_GET_VALUE_FROM_ELEMENT( *qe ); + nvf->node->dr_info.start_addr = temp_dr_info->start_addr; + nvf->node->dr_info.valid_offset = temp_dr_info->valid_offset; + nvf->node->dr_info.dr_offset_start = DR_SIZE; + nvf->node->dr_info.dr_fd = temp_dr_info->dr_fd; + nvf->node->dr_info.dr_serialno = temp_dr_info->dr_serialno; + + if (nvf->node->dr_info.valid_offset < DR_SIZE) { + offset_in_page = nvf->node->true_length % MMAP_PAGE_SIZE; + nvf->node->dr_info.valid_offset += offset_in_page; + } + + nvf->node->dr_info.dr_offset_end = nvf->node->dr_info.valid_offset; + __atomic_fetch_sub(&num_drs_left, 1, __ATOMIC_SEQ_CST); + + DEBUG_FILE("%s: staging inode = %lu. Got from global pool with valid offset = %lld\n", + __func__, nvf->node->dr_info.dr_serialno, nvf->node->dr_info.valid_offset); + + } else { + DEBUG_FILE("%s: Allocating new DR\n", __func__); + // Nothing in global pool + int dr_fd = 0; + int i = 0; + char dr_fname[256]; + unsigned long offset_in_page = 0; + int num_blocks = DR_SIZE / MMAP_PAGE_SIZE; + int max_perms = ((nvf->canRead) ? PROT_READ : 0) | + ((nvf->canWrite) ? PROT_WRITE : 0); + DEBUG_FILE("%s: DR not found in global pool. Allocated dr_file variable\n", __func__); + + sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-XXXXXX"); + dr_fd = syscall_no_intercept(SYS_open, mktemp(dr_fname), O_RDWR | O_CREAT, 0666); + if (dr_fd < 0) { + MSG("%s: mkstemp of DR file failed. 
Err = %s\n", + __func__, strerror(-dr_fd)); + assert(0); + } + posix_fallocate(dr_fd, 0, DR_SIZE); + num_mmap++; + num_drs++; + num_drs_critical_path++; + nvf->node->dr_info.start_addr = (unsigned long) FSYNC_MMAP + ( + NULL, + DR_SIZE, + max_perms, //max_perms, + MAP_SHARED | MAP_POPULATE, + dr_fd, //fd_with_max_perms, + 0 + ); + + DEBUG_FILE("%s: Setting offset_start to DR_SIZE. FD = %d\n", + __func__, nvf->fd); + fstat(dr_fd, &stat_buf); + nvf->node->dr_info.dr_serialno = stat_buf.st_ino; + nvf->node->dr_info.dr_fd = dr_fd; + nvf->node->dr_info.valid_offset = 0; + nvf->node->dr_info.dr_offset_start = DR_SIZE; + offset_in_page = nvf->node->true_length % MMAP_PAGE_SIZE; + if (offset_in_page != 0) + nvf->node->dr_info.valid_offset += offset_in_page; + nvf->node->dr_info.dr_offset_end = nvf->node->dr_info.valid_offset; + dr_fname[0] = '\0'; + DEBUG_FILE("%s: DR not found in global pool. Initialized DR_INFO. FD = %d\n", __func__, dr_fd); + } + start_addr = nvf->node->dr_info.start_addr; + __atomic_fetch_add(&dr_mem_allocated, DR_SIZE, + __ATOMIC_SEQ_CST); + nvf->node->dr_mem_used += DR_SIZE; + + END_TIMING(dr_mem_queue_t, dr_mem_queue_time); + if (IS_ERR(start_addr) || start_addr == 0) + { + MSG("mmap failed for %s, mmap count %d, addr %lu, errno is %lu\n", + strerror(errno), num_mmap, start_addr, errno); + MSG("Open count %d, close count %d\n", + num_open, num_close); + nvf->posix = 1; + assert(0); + } + /* Get the index of the mmap from the size of mmap and + * from the offset. + */ + DEBUG_FILE("%s: offset requested = %lu\n", __func__, offset); + start_offset = (start_offset + + nvf->node->dr_info.valid_offset); + *mmap_addr = start_addr + start_offset; + *offset_within_mmap = start_offset; + *extent_length = DR_SIZE - start_offset; + + DEBUG_FILE("%s: Will do manage DR memory if it is a write\n", + __func__); + if(iswrite) + nvp_manage_dr_memory(nvf, extent_length, + len_to_write, start_offset, index); + + if (!wr_lock && !iswrite) { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + NVP_UNLOCK_NODE_WR(nvf); + + NVP_LOCK_NODE_RD(nvf, cpuid); + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + } + + return 0; +} diff --git a/splitfs_syscall_intercept/src/handle_mmaps.h b/splitfs_syscall_intercept/src/handle_mmaps.h new file mode 100644 index 0000000000..2ddd141d6f --- /dev/null +++ b/splitfs_syscall_intercept/src/handle_mmaps.h @@ -0,0 +1,117 @@ +/* + * ===================================================================================== + * + * Filename: handle_mmaps.h + * + * Description: + * + * Version: 1.0 + * Created: 09/26/2019 01:11:04 AM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +#ifndef SPLITFS_HANDLE_MMAPS_H +#define SPLITFS_HANDLE_MMAPS_H + +#include +#include +#include "file.h" +#include "tbl_mmaps.h" +#include "non_temporal.h" + +#define MAP_SIZE 16 + +#if MAP_SIZE == 512 +#define MAX_MMAP_SIZE 536870912 +#elif MAP_SIZE == 256 +#define MAX_MMAP_SIZE 268435456 +#elif MAP_SIZE == 128 +#define MAX_MMAP_SIZE 134217728 +#elif MAP_SIZE == 64 +#define MAX_MMAP_SIZE 67108864 +#elif MAP_SIZE == 32 +#define MAX_MMAP_SIZE 33554432 +#elif MAP_SIZE == 16 +#define MAX_MMAP_SIZE 16777216 +#elif MAP_SIZE == 8 +#define MAX_MMAP_SIZE 8388608 +#elif MAP_SIZE == 4 +#define MAX_MMAP_SIZE 4194304 +#elif MAP_SIZE == 2 +#define MAX_MMAP_SIZE 2097152 +#else +#define MAX_MMAP_SIZE 536870912 +#endif + 
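+/*
+ * The table above just spells out MAP_SIZE megabytes in bytes. As a
+ * cross-check only (a sketch, not used by the build), an equivalent
+ * computed form would be:
+ *
+ *   #define MAX_MMAP_SIZE ((unsigned long)MAP_SIZE << 20)
+ *
+ * e.g. 16 << 20 == 16777216 and 512 << 20 == 536870912.
+ */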
+#define ANON_MAP_SIZE 16 + +#if ANON_MAP_SIZE == 512 +#define ANON_MAX_MMAP_SIZE 536870912 +#elif ANON_MAP_SIZE == 256 +#define ANON_MAX_MMAP_SIZE 268435456 +#elif ANON_MAP_SIZE == 128 +#define ANON_MAX_MMAP_SIZE 134217728 +#elif ANON_MAP_SIZE == 64 +#define ANON_MAX_MMAP_SIZE 67108864 +#elif ANON_MAP_SIZE == 32 +#define ANON_MAX_MMAP_SIZE 33554432 +#elif ANON_MAP_SIZE == 16 +#define ANON_MAX_MMAP_SIZE 16777216 +#elif ANON_MAP_SIZE == 8 +#define ANON_MAX_MMAP_SIZE 8388608 +#elif ANON_MAP_SIZE == 4 +#define ANON_MAX_MMAP_SIZE 4194304 +#elif ANON_MAP_SIZE == 2 +#define ANON_MAX_MMAP_SIZE 2097152 +#else +#define ANON_MAX_MMAP_SIZE 536870912 +#endif + +int MMAP_PAGE_SIZE; +int MMAP_HUGEPAGE_SIZE; +#define PER_NODE_MAPPINGS 10 + +#define ALIGN_MMAP_DOWN(addr) ((addr) & ~(MAX_MMAP_SIZE - 1)) + +void *intel_memcpy(void * __restrict__ b, const void * __restrict__ a, size_t n); + +#define MEMCPY intel_memcpy +#define MEMCPY_NON_TEMPORAL memmove_nodrain_movnt_granularity +#define MMAP mmap + +extern atomic_uint_fast64_t dr_mem_allocated; + +void create_dr_mmap(struct NVNode *node, int is_overwrite); +void change_dr_mmap(struct NVNode *node, int is_overwrite); +void nvp_free_dr_mmaps(); +void nvp_reset_mappings(struct NVNode *node); +int nvp_get_over_dr_address(struct NVFile *nvf, + off_t offset, + size_t len_to_write, + unsigned long *mmap_addr, + off_t *offset_within_mmap, + size_t *extent_length, + int wr_lock, + int cpuid, + struct NVTable_maps *tbl_app, + struct NVTable_maps *tbl_over); +int nvp_get_mmap_address(struct NVFile *nvf, + off_t offset, + size_t count, + unsigned long *mmap_addr, + unsigned long *bitmap_root, + off_t *offset_within_mmap, + size_t *extent_length, + int wr_lock, + int cpuid, + struct NVTable_maps *tbl_app, + struct NVTable_maps *tbl_over); +int nvp_get_dr_mmap_address(struct NVFile *nvf, off_t offset, size_t len_to_write, size_t count, unsigned long *mmap_addr, off_t *offset_within_mmap, size_t *extent_length, int wr_lock, int cpuid, int iswrite, struct NVTable_maps *tbl_app, struct NVTable_maps *tbl_over); +void nvp_free_btree(unsigned long *root, unsigned long *merkle_root, unsigned long height, unsigned long *dirty_cache, int root_dirty_num, int total_dirty_mmaps); + +#endif diff --git a/splitfs_syscall_intercept/src/inode.h b/splitfs_syscall_intercept/src/inode.h new file mode 100644 index 0000000000..448c54294f --- /dev/null +++ b/splitfs_syscall_intercept/src/inode.h @@ -0,0 +1,79 @@ +/* + * ===================================================================================== + * + * Filename: inode.h + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:15:18 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ + +#ifndef SPLITFS_INODE_H +#define SPLITFS_INODE_H + +#include +#include "nvp_lock.h" +#include "staging.h" + +struct NVNode +{ + ino_t serialno; + ino_t backup_serialno; + NVP_LOCK_DECL; + + unsigned long true_length; + volatile size_t length; + volatile size_t maplength; + unsigned long *root; + unsigned long *merkle_root; + int free_list_idx; + int async_file_close; + unsigned long *root_dirty_cache; + int root_dirty_num; + int total_dirty_mmaps; + unsigned int height; + volatile int reference; + int isRootSet; + int index_in_free_list; + int is_large_file; + + // DR stuff + struct free_dr_pool dr_info; + struct free_dr_pool dr_over_info; + uint64_t dr_mem_used; +}; + +void 
nvp_transfer_to_free_dr_pool(struct NVNode*);
+
+#define NUM_NODE_LISTS 1
+#define NODE_LOCKING 1
+#define LARGE_FILE_THRESHOLD (300*1024*1024)
+
+#if NODE_LOCKING
+
+#define NVP_LOCK_NODE_RD(nvf, cpuid)	NVP_LOCK_RD(nvf->node->lock, cpuid)
+#define NVP_UNLOCK_NODE_RD(nvf, cpuid)	NVP_LOCK_UNLOCK_RD(nvf->node->lock, cpuid)
+#define NVP_LOCK_NODE_WR(nvf)		NVP_LOCK_WR( nvf->node->lock)
+#define NVP_UNLOCK_NODE_WR(nvf)		NVP_LOCK_UNLOCK_WR(nvf->node->lock)
+
+#else
+
+#define NVP_LOCK_NODE_RD(nvf, cpuid)	{(void)(cpuid);}
+#define NVP_UNLOCK_NODE_RD(nvf, cpuid)	{(void)(cpuid);}
+#define NVP_LOCK_NODE_WR(nvf)		{(void)(nvf->node->lock);}
+#define NVP_UNLOCK_NODE_WR(nvf)		{(void)(nvf->node->lock);}
+
+#endif
+
+extern int _nvp_ino_lookup[1024];
+extern pthread_spinlock_t node_lookup_lock[NUM_NODE_LISTS];
+
+#endif
diff --git a/splitfs_syscall_intercept/src/lfq.c b/splitfs_syscall_intercept/src/lfq.c
new file mode 100644
index 0000000000..d2b6ca2da9
--- /dev/null
+++ b/splitfs_syscall_intercept/src/lfq.c
@@ -0,0 +1,242 @@
+#include "cross-platform.h"
+#include "lfq.h"
+#ifdef DEBUG
+#include <assert.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdbool.h>
+#define MAXFREE 150
+
+static
+int inHP(struct lfq_ctx *ctx, struct lfq_node * lfn) {
+	for ( int i = 0 ; i < ctx->MAXHPSIZE ; i++ ) {
+		//lmb(); // not needed, we don't care if loads reorder here, just that we check all the elements
+		if (ctx->HP[i] == lfn)
+			return 1;
+	}
+	return 0;
+}
+
+static
+void enpool(struct lfq_ctx *ctx, struct lfq_node * lfn) {
+	// add to tail of the free list
+	lfn->free_next = NULL;
+	volatile struct lfq_node *old_tail = XCHG(&ctx->fpt, lfn); // seq_cst
+	old_tail->free_next = lfn;
+
+	// getting nodes out of this will have exactly the same deallocation problem
+	// as the main queue.
+	// TODO: a stack might be easier to manage, but would increase contention.
+
+/*
+	volatile struct lfq_node * p;
+	do {
+		p = ctx->fpt;
+	} while(!CAS(&ctx->fpt, p, lfn)); // exchange using CAS
+	p->free_next = lfn;
+*/
+}
+
+static
+void free_pool(struct lfq_ctx *ctx, bool freeall ) {
+	if (!CAS(&ctx->is_freeing, 0, 1))
+		return; // this pool free does not support multithreading; only one thread may run it at a time.
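+	/* Walk the free pool from its head, releasing at most MAXFREE nodes
+	 * per call (all of them when freeall is set during teardown). The
+	 * walk stops early at any node that is still unconsumed, still
+	 * referenced by a hazard pointer, or at the tail of the list. */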
+ volatile struct lfq_node * p; + + for ( int i = 0 ; i < MAXFREE || freeall ; i++ ) { + p = ctx->fph; + if ( (!p->can_free) || (!p->free_next) || inHP(ctx, (struct lfq_node *)p) ) + goto exit; + ctx->fph = p->free_next; + free((void *)p); + } +exit: + ctx->is_freeing = false; + smb(); +} + +static +void safe_free(struct lfq_ctx *ctx, struct lfq_node * lfn) { + if (lfn->can_free && !inHP(ctx,lfn)) { + // free is not thread-safe + if (CAS(&ctx->is_freeing, 0, 1)) { + lfn->next = (void*)-1; // poison the pointer to detect use-after-free + free(lfn); // we got the lock; actually free + ctx->is_freeing = false; + smb(); + } else // we didn't get the lock; only add to a freelist + enpool(ctx, lfn); + } else + enpool(ctx, lfn); + free_pool(ctx, false); +} + +static +int alloc_tid(struct lfq_ctx *ctx) { + for (int i = 0; i < ctx->MAXHPSIZE; i++) + if (ctx->tid_map[i] == 0) + if (CAS(&ctx->tid_map[i], 0, 1)) + return i; + + return -1; +} + +static +void free_tid(struct lfq_ctx *ctx, int tid) { + ctx->tid_map[tid]=0; +} + +int lfq_init(struct lfq_ctx *ctx, int max_consume_thread) { + struct lfq_node * tmpnode = calloc(1,sizeof(struct lfq_node)); + if (!tmpnode) + return -errno; + + struct lfq_node * free_pool_node = calloc(1,sizeof(struct lfq_node)); + if (!free_pool_node) + return -errno; + + tmpnode->can_free = free_pool_node->can_free = true; + memset(ctx, 0, sizeof(struct lfq_ctx)); + ctx->MAXHPSIZE = max_consume_thread; + ctx->HP = calloc(max_consume_thread,sizeof(struct lfq_node)); + ctx->tid_map = calloc(max_consume_thread,sizeof(struct lfq_node)); + ctx->head = ctx->tail=tmpnode; + ctx->fph = ctx->fpt=free_pool_node; + + return 0; +} + + +long lfg_count_freelist(const struct lfq_ctx *ctx) { + long count=0; + struct lfq_node *p = (struct lfq_node *)ctx->fph; // non-volatile + while(p) { + count++; + p = p->free_next; + } + + return count; +} + +int lfq_clean(struct lfq_ctx *ctx){ + if ( ctx->tail && ctx->head ) { // if have data in queue + struct lfq_node *tmp; + while ( (struct lfq_node *) ctx->head ) { // while still have node + tmp = (struct lfq_node *) ctx->head->next; + safe_free(ctx, (struct lfq_node *)ctx->head); + ctx->head = tmp; + } + ctx->tail = 0; + } + if ( ctx->fph && ctx->fpt ) { + free_pool(ctx, true); + if ( ctx->fph != ctx->fpt ) + return -1; + free((void *)ctx->fpt); // free the empty node + ctx->fph=ctx->fpt=0; + } + if ( !ctx->fph && !ctx->fpt ) { + free((void *)ctx->HP); + free((void *)ctx->tid_map); + memset(ctx,0,sizeof(struct lfq_ctx)); + } else + return -1; + + return 0; +} + +int lfq_enqueue(struct lfq_ctx *ctx, void * data) { + struct lfq_node * insert_node = calloc(1,sizeof(struct lfq_node)); + if (!insert_node) + return -errno; + insert_node->data=data; +// mb(); // we've only written to "private" memory that other threads can't see. + volatile struct lfq_node *old_tail; +#if 0 + do { + old_tail = (struct lfq_node *) ctx->tail; + } while(!CAS(&ctx->tail,old_tail,insert_node)); +#else + old_tail = XCHG(&ctx->tail, insert_node); +#endif + // We've claimed our spot in the insertion order by modifying tail + // we are the only inserting thread with a pointer to the old tail. + + // now we can make it part of the list by overwriting the NULL pointer in the old tail + // This is safe whether or not other threads have updated ->next in our insert_node +#ifdef DEBUG + assert(!(old_tail->next) && "old tail wasn't NULL"); +#endif + old_tail->next = insert_node; + // TODO: could a consumer thread could have freed the old tail? 
no because that would leave head=NULL + +// ATOMIC_ADD( &ctx->count, 1); + return 0; +} + +void * lfq_dequeue_tid(struct lfq_ctx *ctx, int tid ) { + //int cn_runtimes = 0; + volatile struct lfq_node *old_head, *new_head; +#if 1 // HP[tid] stuff is necessary for deallocation. (but it's still not safe). + do { + retry: // continue jumps to the bottom of the loop, and would attempt a CAS with uninitialized new_head + old_head = ctx->head; + ctx->HP[tid] = old_head; // seq-cst store. (better: use xchg instead of mov + mfence on x86) + mb(); + + if (old_head != ctx->head) // another thread freed it before seeing our HP[tid] store + goto retry; + new_head = old_head->next; // FIXME: crash with old_head=NULL during deallocation (tid=5)? (main thread=25486, this=25489) + if (new_head==0 /* || new_head != old_head->next*/ ){ // redoing the same load isn't useful + ctx->HP[tid] = 0; + return 0; // never remove the last node + } +#ifdef DEBUG + assert(new_head != (void*)-1 && "read an already-freed node"); +#endif + } while( ! CAS(&ctx->head, old_head, new_head) ); +#else // without HP[] stuff + do { + old_head = ctx->head; + //ctx->HP[tid] = old_head; + new_head = old_head->next; + //if (old_head != ctx->head) continue; + if (!new_head) { + // ctx->HP[tid] = 0; + return 0; // never remove the last node + } +#ifdef DEBUG + assert(new_head != (void*)-1 && "read an already-freed node"); +#endif + } while( !CAS(&ctx->head, old_head, new_head) ); +#endif +// mb(); // CAS is already a memory barrier, at least on x86. + + // we've atomically advanced head, and we're the thread that won the race to claim a node + // We return the data from the *new* head. + // The list starts off with a dummy node, so the current head is always a node that's already been read. + + ctx->HP[tid] = 0; + void *ret = new_head->data; + new_head->can_free = true; +// ATOMIC_SUB( &ctx->count, 1 ); + + //old_head->next = (void*)-1; // done in safe-free in the actual free() path. 
poison the pointer to detect use-after-free
+
+	// we need to avoid freeing until other readers are definitely not going to load its ->next in the CAS loop
+	safe_free(ctx, (struct lfq_node *)old_head);
+
+	//free(old_head);
+	return ret;
+}
+
+void * lfq_dequeue(struct lfq_ctx *ctx ) {
+	//return lfq_dequeue_tid(ctx, 0); // TODO: let this inline even in the shared library
+// old version
+	int tid = alloc_tid(ctx);
+	if (tid==-1)
+		return (void *)-1; // too many threads are dequeuing; no free tid slot
+
+	void * ret = lfq_dequeue_tid(ctx, tid);
+	free_tid(ctx, tid);
+	return ret;
+}
diff --git a/splitfs_syscall_intercept/src/lfq.h b/splitfs_syscall_intercept/src/lfq.h
new file mode 100644
index 0000000000..58a9e44379
--- /dev/null
+++ b/splitfs_syscall_intercept/src/lfq.h
@@ -0,0 +1,52 @@
+#ifndef __LFQ_H__
+#define __LFQ_H__
+#include "cross-platform.h"
+
+#include <stdalign.h> // C11, for alignas()
+
+struct lfq_node{
+	void * data;
+	union {
+		struct lfq_node * volatile next;
+		struct lfq_node * volatile free_next;
+	};
+	volatile int can_free;
+};
+
+struct lfq_ctx{
+	alignas(64) volatile struct lfq_node * volatile head;
+	int volatile count;
+	volatile struct lfq_node * * HP;
+	volatile int * tid_map;
+	int volatile is_freeing;
+	volatile struct lfq_node * volatile fph; // free pool head
+	volatile struct lfq_node * volatile fpt; // free pool tail
+	int MAXHPSIZE;
+
+	alignas(64) volatile struct lfq_node * volatile tail; // in another cache line to avoid contention
+};
+
+int lfq_init(struct lfq_ctx *ctx, int max_consume_thread);
+int lfq_clean(struct lfq_ctx *ctx);
+long lfg_count_freelist(const struct lfq_ctx *ctx);
+
+int lfq_enqueue(struct lfq_ctx *ctx, void * data);
+void * lfq_dequeue_tid(struct lfq_ctx *ctx, int tid );
+void * lfq_dequeue(struct lfq_ctx *ctx );
+
+/**********************************************************
+ *
+ * This macro dequeues in a busy loop until it gets an item.
+ * If burning a CPU this way is undesirable, call the plain
+ * dequeue function with a memory barrier yourself and
+ * sleep/thread_yield between attempts instead.
+ *
+ *********************************************************/
+#define LFQ_MB_DEQUEUE(ctx, ret) ({ \
+	do { \
+		ret = lfq_dequeue(ctx); \
+		mb(); \
+	} while(ret == 0); \
+})
+
+#endif
diff --git a/splitfs_syscall_intercept/src/link.c b/splitfs_syscall_intercept/src/link.c
new file mode 100644
index 0000000000..c02b6ffc12
--- /dev/null
+++ b/splitfs_syscall_intercept/src/link.c
@@ -0,0 +1,120 @@
+#include <libsyscall_intercept_hook_point.h>
+#include <sys/syscall.h>
+
+#include "timers.h"
+#include "log.h"
+
+RETT_SYSCALL_INTERCEPT _sfs_LINK(INTF_SYSCALL)
+{
+	DEBUG_FILE("CALL: %s\n", __func__);
+	char *path1, *path2;
+
+	path1 = (char *)arg0;
+	path2 = (char *)arg1;
+
+	*result = syscall_no_intercept(SYS_link, path1, path2);
+	instrumentation_type op_log_entry_time;
+	// Write to op log
+
+#if !POSIX_ENABLED
+	if(*result == 0) {
+		START_TIMING(op_log_entry_t, op_log_entry_time);
+		persist_op_entry(LOG_LINK,
+				 path1,
+				 path2,
+				 0,
+				 0);
+		END_TIMING(op_log_entry_t, op_log_entry_time);
+	}
+#endif
+	return RETT_NO_PASS_KERN;
+}
+
+RETT_SYSCALL_INTERCEPT _sfs_SYMLINK(INTF_SYSCALL)
+{
+	DEBUG_FILE("CALL: %s\n", __func__);
+
+	char *path1, *path2;
+
+	path1 = (char *)arg0;
+	path2 = (char *)arg1;
+
+	*result = syscall_no_intercept(SYS_symlink, path1, path2);
+	instrumentation_type op_log_entry_time;
+	// Write to op log
+
+#if !POSIX_ENABLED
+	if(*result == 0) {
+		START_TIMING(op_log_entry_t, op_log_entry_time);
+		persist_op_entry(LOG_SYMLINK,
+				 path1,
+				 path2,
+				 0,
+				 0);
+		END_TIMING(op_log_entry_t, op_log_entry_time);
+	}
+#endif
+	return RETT_NO_PASS_KERN;
+}
+
+RETT_SYSCALL_INTERCEPT _sfs_SYMLINKAT(INTF_SYSCALL)
+{
+	DEBUG_FILE("CALL: %s\n", __func__);
+	instrumentation_type op_log_entry_time;
+	char *old_path, *new_path;
+	int newdirfd;
+
+	old_path = (char *)arg0;
+	newdirfd = (int)arg1;
+	new_path = (char *)arg2;
+
+	*result = syscall_no_intercept(SYS_symlinkat, old_path, newdirfd, new_path);
+	// Write to op log
+
+#if !POSIX_ENABLED
+	char path[256];
+	int path_len = 0;
+	if (newdirfd == AT_FDCWD) {
+		if (new_path[0] != '/') {
+			if (getcwd(path, sizeof(path)) == NULL)
+				assert(0);
+
+			path_len = strlen(path);
+			path[path_len] = '/';
+			path[path_len+1] = '\0';
+
+			strcat(path, new_path);
+		} else {
+			strcpy(path, new_path);
+		}
+	} else {
+		char fd_str[256];
+		if (new_path[0] != '/') {
+			sprintf(fd_str, "/proc/self/fd/%d", newdirfd);
+			path_len = readlink(fd_str, path, sizeof(path) - 1);
+			if (path_len < 0)
+				assert(0);
+			path[path_len] = '/';	/* readlink() does not null-terminate */
+			path[path_len+1] = '\0';
+			strcat(path, new_path);
+		} else {
+			strcpy(path, new_path);
+		}
+	}
+
+	START_TIMING(op_log_entry_t, op_log_entry_time);
+	persist_op_entry(LOG_SYMLINK,
+			 old_path,
+			 path,
+			 0,
+			 0);
+	END_TIMING(op_log_entry_t, op_log_entry_time);
+#endif
+	return RETT_NO_PASS_KERN;
+}
\ No newline at end of file
diff --git a/splitfs_syscall_intercept/src/log.c b/splitfs_syscall_intercept/src/log.c
new file mode 100644
index 0000000000..7b4b8945ec
--- /dev/null
+++ b/splitfs_syscall_intercept/src/log.c
@@ -0,0 +1,783 @@
+// LEDGER logging implementation
+#include <ftw.h>	/* nftw() */
+#include "log.h"
+#include "handle_mmaps.h"
+#include "tbl_mmaps.h"
+#include "timers.h"
+#include "file.h"
+#include "fsync.h"
+#include "relink.h"
+#include "add_delay.h"
+
+static uint8_t clearing_app_log;
+static uint8_t clearing_op_log;
+static loff_t app_log_tail;
+static loff_t op_log_tail;
+static loff_t app_log_lim;
+static loff_t op_log_lim;
+static int app_log_fd;
+static int op_log_fd;
+static unsigned 
long app_log;
+static unsigned long op_log;
+static struct inode_path *ino_path_head;
+
+uint32_t crc32_for_byte(uint32_t r) {
+	for(int j = 0; j < 8; ++j)
+		r = (r & 1? 0: (uint32_t)0xEDB88320L) ^ r >> 1;
+	return r ^ (uint32_t)0xFF000000L;
+}
+
+void create_crc32(const void *data, size_t n_bytes, uint32_t* crc) {
+	static uint32_t table[0x100];
+	if(!*table)
+		for(size_t i = 0; i < 0x100; ++i)
+			table[i] = crc32_for_byte(i);
+	for(size_t i = 0; i < n_bytes; ++i)
+		*crc = table[(uint8_t)*crc ^ ((uint8_t*)data)[i]] ^ *crc >> 8;
+}
+
+
+void init_logs() {
+	int i = 0, ret = 0;
+	unsigned long num_blocks = APPEND_LOG_SIZE / MMAP_PAGE_SIZE;
+	char prefault_buf[MMAP_PAGE_SIZE];
+
+	clearing_app_log = 0;
+	clearing_op_log = 0;
+	app_log_tail = 0;
+	op_log_tail = 0;
+	app_log_lim = APPEND_LOG_SIZE;
+	op_log_lim = OP_LOG_SIZE;
+	app_log_fd = -1;
+	op_log_fd = -1;
+	app_log = 0;
+	op_log = 0;
+
+	MSG("%s: Initializing append and op log\n", __func__);
+
+	app_log_fd = syscall_no_intercept(SYS_open, APPEND_LOG_PATH, O_RDWR | O_CREAT, 0666);
+	if (app_log_fd < 0) {
+		MSG("%s: Creation of append log file failed. Err = %s\n",
+		    __func__, strerror(-app_log_fd));
+		assert(0);
+	}
+	op_log_fd = syscall_no_intercept(SYS_open, OP_LOG_PATH, O_RDWR | O_CREAT, 0666);
+	if (op_log_fd < 0) {
+		MSG("%s: Creation of op log file failed. Err = %s\n",
+		    __func__, strerror(-op_log_fd));
+		assert(0);
+	}
+
+	ret = posix_fallocate(app_log_fd, 0, APPEND_LOG_SIZE);
+	if (ret < 0) {
+		MSG("%s: posix_fallocate append log failed. Err = %s\n",
+		    __func__, strerror(errno));
+		assert(0);
+	}
+	ret = posix_fallocate(op_log_fd, 0, OP_LOG_SIZE);
+	if (ret < 0) {
+		MSG("%s: posix_fallocate op log failed. Err = %s\n",
+		    __func__, strerror(errno));
+		assert(0);
+	}
+
+	app_log = (unsigned long) FSYNC_MMAP
+		(
+		 NULL,
+		 APPEND_LOG_SIZE,
+		 PROT_READ | PROT_WRITE, //max_perms,
+		 MAP_SHARED | MAP_POPULATE,
+		 app_log_fd, //fd_with_max_perms,
+		 0
+		 );
+
+	op_log = (unsigned long) FSYNC_MMAP
+		(
+		 NULL,
+		 OP_LOG_SIZE,
+		 PROT_READ | PROT_WRITE, //max_perms,
+		 MAP_SHARED | MAP_POPULATE,
+		 op_log_fd, //fd_with_max_perms,
+		 0
+		 );
+
+	for (i = 0; i < MMAP_PAGE_SIZE; i++)
+		prefault_buf[i] = '0';
+
+	for (i = 0; i < num_blocks; i++) {
+		if(MEMCPY_NON_TEMPORAL((void *)app_log + i*MMAP_PAGE_SIZE,
+				       prefault_buf,
+				       MMAP_PAGE_SIZE) == NULL) {
+			MSG("%s: non-temporal memcpy app log failed\n", __func__);
+			assert(0);
+		}
+		if(MEMCPY_NON_TEMPORAL((void *)op_log + i*MMAP_PAGE_SIZE,
+				       prefault_buf,
+				       MMAP_PAGE_SIZE) == NULL) {
+			MSG("%s: non-temporal memcpy op log failed\n", __func__);
+			assert(0);
+		}
+	}
+}
+
+void init_append_log() {
+
+	int i = 0, ret = 0;
+	unsigned long num_blocks = APPEND_LOG_SIZE / MMAP_PAGE_SIZE;
+	char prefault_buf[MMAP_PAGE_SIZE];
+
+	clearing_app_log = 0;
+	app_log_tail = 0;
+	app_log_lim = APPEND_LOG_SIZE;
+	app_log_fd = -1;
+	app_log = 0;
+
+	MSG("%s: Initializing append log\n", __func__);
+
+	app_log_fd = syscall_no_intercept(SYS_open, APPEND_LOG_PATH, O_RDWR | O_CREAT, 0666);
+	if (app_log_fd < 0) {
+		MSG("%s: Creation of append log file failed. Err = %s\n",
+		    __func__, strerror(-app_log_fd));
+		assert(0);
+	}
+
+	ret = posix_fallocate(app_log_fd, 0, APPEND_LOG_SIZE);
+	if (ret < 0) {
+		MSG("%s: posix_fallocate append log failed. 
Err = %s\n",
+		    __func__, strerror(errno));
+		assert(0);
+	}
+
+	app_log = (unsigned long) FSYNC_MMAP
+		(
+		 NULL,
+		 APPEND_LOG_SIZE,
+		 PROT_READ | PROT_WRITE, //max_perms,
+		 MAP_SHARED | MAP_POPULATE,
+		 app_log_fd, //fd_with_max_perms,
+		 0
+		 );
+
+	for (i = 0; i < MMAP_PAGE_SIZE; i++)
+		prefault_buf[i] = '0';
+
+	for (i = 0; i < num_blocks; i++) {
+#if NON_TEMPORAL_WRITES
+		if(MEMCPY_NON_TEMPORAL((void *)app_log + i*MMAP_PAGE_SIZE,
+				       prefault_buf,
+				       MMAP_PAGE_SIZE) == NULL) {
+			MSG("%s: non-temporal memcpy app log failed\n", __func__);
+			assert(0);
+		}
+#else // NON_TEMPORAL_WRITES
+		if(FSYNC_MEMCPY((char *)app_log + i*MMAP_PAGE_SIZE,
+				prefault_buf,
+				MMAP_PAGE_SIZE) == NULL) {
+			MSG("%s: temporal memcpy app log failed\n", __func__);
+			assert(0);
+		}
+#endif // NON_TEMPORAL_WRITES
+	}
+}
+
+static void sync_and_clear_app_log() {
+	int i = 0;
+	struct NVFile *nvf = NULL;
+	int cpuid = GET_CPUID();
+	instrumentation_type append_log_reinit_time;
+
+	START_TIMING(append_log_reinit_t, append_log_reinit_time);
+	for (i = 3; i < OPEN_MAX; i++) {
+		nvf = &_nvp_fd_lookup[i];
+		NVP_LOCK_FD_RD(nvf, cpuid);
+		if (nvf->fd > 0 && nvf->valid && !nvf->posix && nvf->node) {
+			DEBUG_FILE("%s: Calling dynamic remap, because app log is full\n", __func__);
+			NVP_LOCK_NODE_WR(nvf);
+			/* [TODO] Do some checks to see if
+			 * there are appends, and if there are,
+			 * perform the dynamic remap system call.
+			 */
+			if (nvf->node->true_length != nvf->node->length)
+				perform_dynamic_remap(nvf);
+			NVP_UNLOCK_NODE_WR(nvf);
+		}
+		DEBUG_FILE("%s: File %i synced. OPEN_MAX = %i\n", __func__, i, OPEN_MAX);
+		NVP_UNLOCK_FD_RD(nvf, cpuid);
+	}
+	// memset((void *)app_log, 0, APPEND_LOG_SIZE);
+	__sync_bool_compare_and_swap(&app_log_tail, app_log_tail, 0);
+	__sync_bool_compare_and_swap(&clearing_app_log, 1, 0);
+	END_TIMING(append_log_reinit_t, append_log_reinit_time);
+}
+
+static void sync_and_clear_op_log() {
+	int i = 0, ret = -1;
+	struct NVFile *nvf = NULL;
+	int cpuid = GET_CPUID();
+
+	for (i = 3; i < OPEN_MAX; i++) {
+		nvf = &_nvp_fd_lookup[i];
+		NVP_LOCK_FD_RD(nvf, cpuid);
+		if (nvf->valid) {
+			ret = syncfs(nvf->fd);
+			if (ret != 0) {
+				DEBUG_FILE("%s: Syncfs failed. Err = %s\n",
+					   __func__, strerror(errno));
+				assert(0);
+			}
+		}
+		NVP_UNLOCK_FD_RD(nvf, cpuid);
+	}
+	memset((void *)op_log, 0, OP_LOG_SIZE);
+	__sync_bool_compare_and_swap(&op_log_tail, op_log_tail, 0);
+	__sync_bool_compare_and_swap(&clearing_op_log, 1, 0);
+}
+
+void ledger_op_log_recovery() {
+
+	int ret = 0;
+	char fname1[256], fname2[256];
+	struct op_log_entry op_entry;
+	uint32_t computed_checksum = 0;
+	op_log_tail = 0;
+
+	while(op_log_tail < op_log_lim) {
+		/* The log was prefaulted with '0' bytes; the first one we hit
+		 * marks the end of the recorded entries. */
+		if (*(char *)(op_log + op_log_tail) == '0')
+			goto end;
+		memcpy(&op_entry,
+		       (void *) (op_log + op_log_tail),
+		       OP_LOG_ENTRY_SIZE);
+
+		switch (op_entry.op_type) {
+		case LOG_DIR_CREATE:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';	/* the log stores names without a terminating NUL */
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname1, F_OK);
+			if (ret == 0)
+				goto next;
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname1);
+					assert(0);
+				}
+			}
+			ret = mkdir(fname1, op_entry.mode);
+			if (ret != 0) {
+				MSG("%s: mkdir failed. 
Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			break;
+		case LOG_RENAME:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';
+			memcpy(fname2,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE + op_entry.file1_size),
+			       op_entry.file2_size
+			       );
+			fname2[op_entry.file2_size] = '\0';
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname2, F_OK);
+			if (ret == 0)
+				goto next;
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname2);
+					assert(0);
+				}
+			}
+			ret = rename(fname1, fname2);
+			if (ret != 0) {
+				MSG("%s: rename failed. Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			break;
+		case LOG_LINK:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';
+			memcpy(fname2,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE + op_entry.file1_size),
+			       op_entry.file2_size
+			       );
+			fname2[op_entry.file2_size] = '\0';
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname2, F_OK);
+			if (ret == 0)
+				goto next;
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname2);
+					assert(0);
+				}
+			}
+			ret = link(fname1, fname2);
+			if (ret != 0) {
+				MSG("%s: link failed. Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			break;
+		case LOG_SYMLINK:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';
+			memcpy(fname2,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE + op_entry.file1_size),
+			       op_entry.file2_size
+			       );
+			fname2[op_entry.file2_size] = '\0';
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname2, F_OK);
+			if (ret == 0)
+				goto next;
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname2);
+					assert(0);
+				}
+			}
+			ret = symlink(fname1, fname2);
+			if (ret != 0) {
+				MSG("%s: symlink failed. Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			break;
+		case LOG_DIR_DELETE:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname1, F_OK);
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname1);
+					assert(0);
+				}
+				else
+					goto next;
+			}
+			ret = rmdir(fname1);
+			if (ret != 0) {
+				MSG("%s: rmdir failed. 
Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			break;
+		case LOG_FILE_CREATE:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname1, F_OK);
+			if (ret == 0)
+				goto next;
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname1);
+					assert(0);
+				}
+			}
+			ret = open(fname1, op_entry.flags, op_entry.mode);
+			if (ret < 0) {
+				MSG("%s: create file failed. Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			close(ret);	/* only the create itself needs replaying */
+			break;
+		case LOG_FILE_UNLINK:
+			memcpy(fname1,
+			       (void *) (op_log + op_log_tail + OP_LOG_ENTRY_SIZE),
+			       op_entry.file1_size
+			       );
+			fname1[op_entry.file1_size] = '\0';
+			create_crc32((void *) &(op_entry.entry_size),
+				     op_entry.entry_size,
+				     &(computed_checksum));
+			if (computed_checksum != op_entry.checksum) {
+				DEBUG_FILE("%s: checksum mismatch\n", __func__);
+				return;
+			}
+			ret = access(fname1, F_OK);
+			if (ret != 0) {
+				if (errno != ENOENT) {
+					MSG("%s: access failed for file %s\n",
+					    __func__, fname1);
+					assert(0);
+				}
+				else
+					goto next;
+			}
+			ret = unlink(fname1);
+			if (ret != 0) {
+				MSG("%s: unlink failed. Err = %s\n",
+				    __func__, strerror(errno));
+				assert(0);
+			}
+			break;
+		}
+
+	next:
+		op_log_tail += op_entry.entry_size;
+		if (op_log_tail % CLFLUSH_SIZE != 0) {
+			op_log_tail += (CLFLUSH_SIZE - (op_log_tail % CLFLUSH_SIZE));
+		}
+	}
+
+ end:
+	DEBUG_FILE("%s: Op log recovery completed successfully\n",
+		   __func__);
+}
+
+static int ino_path_info(const char *fpath,
+			 const struct stat *sb,
+			 int tflag,
+			 struct FTW *ftwbuf) {
+
+	struct inode_path *ino_path = NULL;
+	if (tflag != FTW_F)
+		return 0;
+	if (ino_path_head == NULL) {
+		ino_path_head = (struct inode_path *) malloc(sizeof(struct inode_path));
+		ino_path = ino_path_head;
+	} else {
+		ino_path = ino_path_head;
+		while (ino_path->next != NULL)
+			ino_path = ino_path->next;
+		ino_path->next = (struct inode_path *) malloc(sizeof(struct inode_path));
+		ino_path = ino_path->next;
+	}
+	strcpy(ino_path->path, fpath);
+	ino_path->file_ino = sb->st_ino;
+	ino_path->file_size = sb->st_size;
+	ino_path->next = NULL;
+	return 0;
+}
+
+static void get_relevant_file(struct inode_path *ino_path_file,
+			      ino_t file_ino) {
+	struct inode_path *ino_path = NULL;
+
+	if (ino_path_head == NULL) {
+		ino_path_file->file_ino = 0;
+		return;
+	}
+
+	ino_path = ino_path_head;
+	while (ino_path != NULL) {
+		if (ino_path->file_ino == file_ino) {
+			ino_path_file->file_ino = ino_path->file_ino;
+			ino_path_file->file_size = ino_path->file_size;
+			strcpy(ino_path_file->path, ino_path->path);
+			ino_path_file->next = NULL;
+			return;
+		} else
+			ino_path = ino_path->next;
+	}
+	/* no match found */
+	ino_path_file->file_ino = 0;
+}
+
+void ledger_append_log_recovery() {
+
+	int ret = 0, file_fd = 0, dr_fd = 0;
+	unsigned long dr_addr = 0;
+	struct inode_path ino_path_file, ino_path_dr;
+	struct append_log_entry app_entry;
+	uint32_t computed_checksum = 0;
+	ino_path_head = NULL;
+	app_log_tail = 0;
+
+	ret = nftw(NVMM_PATH, ino_path_info, 20, 0);
+	if (ret == -1) {
+		MSG("%s: nftw failed. 
Err = %s\n", __func__, strerror(errno));
+		assert(0);
+	}
+
+	while(app_log_tail < app_log_lim) {
+		/* stop at the first prefault byte: nothing was logged past here */
+		if (*(char *)(app_log + app_log_tail) == '0')
+			goto end;
+		memcpy(&app_entry,
+		       (void *) (app_log + app_log_tail),
+		       APPEND_LOG_ENTRY_SIZE);
+		create_crc32((void *) &(app_entry.file_ino), 32, &computed_checksum);
+		if (computed_checksum != app_entry.checksum) {
+			DEBUG_FILE("%s: checksum mismatch\n", __func__);
+			return;
+		}
+		get_relevant_file(&ino_path_file, app_entry.file_ino);
+		get_relevant_file(&ino_path_dr, app_entry.dr_ino);
+		if (ino_path_file.file_ino == 0 || ino_path_dr.file_ino == 0)
+			goto next;
+		if ((ino_path_file.file_size != app_entry.file_offset) ||
+		    (ino_path_dr.file_size < app_entry.dr_offset + app_entry.data_size))
+			goto next;
+		// Open file X, file DR.
+		file_fd = open(ino_path_file.path, O_RDWR);
+		if (file_fd < 0) {
+			MSG("%s: Open failed for path %s\n",
+			    __func__, ino_path_file.path);
+			assert(0);
+		}
+		dr_fd = open(ino_path_dr.path, O_RDWR);
+		if (dr_fd < 0) {
+			MSG("%s: Open failed for path %s\n",
+			    __func__, ino_path_dr.path);
+			assert(0);
+		}
+
+		// MAP DR file.
+		dr_addr = (unsigned long) FSYNC_MMAP
+			(
+			 NULL,
+			 ino_path_dr.file_size,
+			 PROT_READ | PROT_WRITE, //max_perms,
+			 MAP_SHARED | MAP_POPULATE,
+			 dr_fd, //fd_with_max_perms,
+			 0
+			 );
+		if (dr_addr == 0) {
+			MSG("%s: mmap failed. Err = %s\n",
+			    __func__, strerror(errno));
+			assert(0);
+		}
+
+		// Do dynamic remap between file X and DR file or simply copy data.
+		ret = syscall(335, file_fd,
+			      dr_fd,
+			      app_entry.file_offset,
+			      app_entry.dr_offset,
+			      (const char *)dr_addr,
+			      app_entry.data_size);
+		if (ret < 0) {
+			MSG("%s: Dynamic remap call failed. Err = %s\n",
+			    __func__, strerror(errno));
+			assert(0);
+		}
+
+		// Close file X and DR file.
+		ret = munmap((void *) dr_addr, ino_path_dr.file_size);
+		if (ret < 0) {
+			MSG("%s: unmap of dr file failed. 
Err = %s\n", + __func__, strerror(errno)); + assert(0); + } + close(dr_fd); + close(file_fd); + next: + app_log_tail += APPEND_LOG_ENTRY_SIZE; + } + + end: + DEBUG_FILE("%s: Append log recovery completed successfully\n", + __func__); +} + +void persist_append_entry(uint32_t file_ino, + uint32_t dr_ino, + loff_t file_off, + loff_t dr_off, + size_t size) { + loff_t log_off; + struct append_log_entry app_entry; + fetch_and_add: + log_off = __sync_fetch_and_add(&app_log_tail, APPEND_LOG_ENTRY_SIZE); + if (app_log_tail > app_log_lim) { + if (__sync_bool_compare_and_swap(&clearing_app_log, 0, 1)) + sync_and_clear_app_log(); + goto fetch_and_add; + } + app_entry.file_ino = file_ino; + app_entry.dr_ino = dr_ino; + app_entry.file_offset = file_off; + app_entry.dr_offset = dr_off; + app_entry.data_size = size; + create_crc32((void *) &(app_entry.file_ino), 32, &(app_entry.checksum)); + //app_entry.checksum = checksum; // [TODO] Calculate Checksum + MEMCPY_NON_TEMPORAL((void *)app_log + log_off, + &app_entry, + APPEND_LOG_ENTRY_SIZE); + _mm_sfence(); +#if NVM_DELAY + perfmodel_add_delay(0, APPEND_LOG_ENTRY_SIZE); +#endif +} + +void persist_op_entry(uint32_t op_type, + const char *fname1, + const char *fname2, + uint32_t mode, + uint32_t flags) { + loff_t log_off; + size_t padding = 0; + struct op_log_entry op_entry; + + DEBUG_FILE("%s: START\n", __func__); + + op_entry.entry_size = OP_LOG_ENTRY_SIZE + strlen(fname1); + op_entry.file1_size = strlen(fname1); + if (fname2 != NULL) { + op_entry.entry_size += strlen(fname2); + op_entry.file2_size = strlen(fname2); + } + if (op_entry.entry_size % CLFLUSH_SIZE != 0) + padding = CLFLUSH_SIZE - (op_entry.entry_size % CLFLUSH_SIZE); + + fetch_and_add: + log_off = __sync_fetch_and_add(&op_log_tail, op_entry.entry_size + padding); + if (op_log_tail > op_log_lim) { + if (__sync_bool_compare_and_swap(&clearing_op_log, 0, 1)) + sync_and_clear_op_log(); + goto fetch_and_add; + } + + op_entry.op_type = op_type; + op_entry.mode = mode; + op_entry.flags = flags; + create_crc32((void *) &(op_entry.entry_size), op_entry.entry_size, &(op_entry.checksum)); + + DEBUG_FILE("%s: Got the checksum. log_off = %lu, op_log = %lu\n", __func__, log_off, op_log); + + //op_entry.checksum = checksum; // [TODO] Calculate Checksum + MEMCPY_NON_TEMPORAL((void *)op_log + log_off, + &op_entry, + OP_LOG_ENTRY_SIZE); +#if NVM_DELAY + perfmodel_add_delay(0, OP_LOG_ENTRY_SIZE); +#endif + log_off += OP_LOG_ENTRY_SIZE; + MEMCPY_NON_TEMPORAL((void *)op_log + log_off, + fname1, + strlen(fname1)); +#if NVM_DELAY + perfmodel_add_delay(0, strlen(fname1)); +#endif + log_off += strlen(fname1); + if (fname2 != NULL) { + MEMCPY_NON_TEMPORAL((void *)op_log + log_off, + fname2, + strlen(fname2)); +#if NVM_DELAY + perfmodel_add_delay(0, strlen(fname2)); +#endif + log_off += strlen(fname2); + } + if (padding != 0) { + char padstr[padding]; + MEMCPY_NON_TEMPORAL((void *)op_log + log_off, + padstr, + padding); +#if NVM_DELAY + perfmodel_add_delay(0, padding); +#endif + } + + _mm_sfence(); + +} + +void init_op_log() { + + int i = 0, ret = 0; + unsigned long num_blocks = APPEND_LOG_SIZE / MMAP_PAGE_SIZE; + char prefault_buf[MMAP_PAGE_SIZE]; + + clearing_op_log = 0; + op_log_tail = 0; + op_log_lim = OP_LOG_SIZE; + op_log_fd = -1; + op_log = 0; + + + op_log_fd = syscall_no_intercept(SYS_open, OP_LOG_PATH, O_RDWR | O_CREAT, 0666); + if (op_log_fd < 0) { + MSG("%s: Creation of op log file failed. 
Err = %s\n", + __func__, strerror(-op_log_fd)); + assert(0); + } + + ret = posix_fallocate(op_log_fd, 0, OP_LOG_SIZE); + if (ret < 0) { + MSG("%s: posix_fallocate op log failed. Err = %s\n", + __func__, strerror(errno)); + assert(0); + } + + op_log = (unsigned long) FSYNC_MMAP + ( + NULL, + OP_LOG_SIZE, + PROT_READ | PROT_WRITE, //max_perms, + MAP_SHARED | MAP_POPULATE, + op_log_fd, //fd_with_max_perms, + 0 + ); + + for (i = 0; i < MMAP_PAGE_SIZE; i++) + prefault_buf[i] = '0'; + + for (i = 0; i < num_blocks; i++) { +#if NON_TEMPORAL_WRITES + if(MEMCPY_NON_TEMPORAL((void *)op_log + i*MMAP_PAGE_SIZE, + prefault_buf, + MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy op log failed\n", __func__); + assert(0); + } +#else // NON_TEMPORAL_WRITES + if(FSYNC_MEMCPY((char *)op_log + i*MMAP_PAGE_SIZE, + prefault_buf, + MMAP_PAGE_SIZE) == NULL) { + MSG("%s: temporal memcpy op log failed\n", __func__); + assert(0); + } +#endif // NON_TEMPORAL_WRITES + } +} diff --git a/splitfs_syscall_intercept/src/log.h b/splitfs_syscall_intercept/src/log.h new file mode 100644 index 0000000000..ed52ff0c1e --- /dev/null +++ b/splitfs_syscall_intercept/src/log.h @@ -0,0 +1,69 @@ +// Header file for LEDGER Logging + +#ifndef _LEDGER_LOG_H_ +#define _LEDGER_LOG_H_ + +#include +#include + +#define APPEND_LOG_ENTRY_SIZE 64 +#define OP_LOG_ENTRY_SIZE 37 +#define APPEND_LOG_PATH "/mnt/pmem_emul/append.log" +#define OP_LOG_PATH "/mnt/pmem_emul/operation.log" +#define APPEND_LOG_SIZE (128*1024*1024) +#define OP_LOG_SIZE (128*1024*1024) + +enum log_types { + LOG_DIR_CREATE, + LOG_RENAME, + LOG_LINK, + LOG_SYMLINK, + LOG_DIR_DELETE, + LOG_FILE_APPEND, + LOG_FILE_CREATE, + LOG_FILE_UNLINK, + LOG_TYPES_NUM, +}; + +struct append_log_entry { + uint32_t checksum; + uint32_t file_ino; + uint32_t dr_ino; + loff_t file_offset; + loff_t dr_offset; + size_t data_size; + uint8_t padding[28]; +}; + +struct op_log_entry { + uint32_t checksum; + size_t entry_size; + size_t file1_size; + size_t file2_size; + uint8_t op_type; + uint32_t mode; + uint32_t flags; +}; + +struct inode_path { + uint32_t file_ino; + size_t file_size; + char path[256]; + struct inode_path *next; +}; + +void init_logs(); +void init_op_log(); +void init_append_log(); +void persist_op_entry(uint32_t op_type, + const char *fname1, + const char *fname2, + uint32_t mode, + uint32_t flags); +void persist_append_entry(uint32_t file_ino, + uint32_t dr_ino, + loff_t file_off, + loff_t dr_off, + size_t size); + +#endif diff --git a/splitfs_syscall_intercept/src/lru_cache.c b/splitfs_syscall_intercept/src/lru_cache.c new file mode 100644 index 0000000000..d27ff00383 --- /dev/null +++ b/splitfs_syscall_intercept/src/lru_cache.c @@ -0,0 +1,217 @@ +#include "lru_cache.h" +#include "stack.h" + + +int insert_in_seq_list(struct ClosedFiles *node, ino_t *stale_serialno, int fd, ino_t serialno) { + + int stale_fd = -1; + + if (node->fd != -1) { + stale_fd = node->fd; + *stale_serialno = node->serialno; + } + + node->fd = fd; + node->serialno = serialno; + + return stale_fd; +} + + +/* + * Insert a new node in the LRU cache at the head position + */ +int insert_in_lru_list(int fd, ino_t serialno, ino_t *stale_serialno) { + + struct ClosedFiles *node = NULL; + struct ClosedFiles *node_to_be_removed = NULL; + int stale_fd = -1; + int hash_index = -1; + int idx_in_free_list = -1; + + LRU_LOCK_HEAD_WR(); + + idx_in_free_list = pop_from_stack(0, 1, -1); + if (idx_in_free_list != -1) { + + node = (struct ClosedFiles *)&_nvp_closed_files[idx_in_free_list]; + node->fd = fd; + 
node->serialno = serialno; + node->index_in_free_list = idx_in_free_list; + node->next_closed_file = -1; + node->prev_closed_file = -1; + + if (lru_head == -1) { + lru_head = node->index_in_free_list; + lru_tail = node->index_in_free_list; + lru_tail_serialno = node->serialno; + } else if (lru_tail == -1) { + lru_tail = lru_head; + lru_tail_serialno = _nvp_closed_files[lru_tail].serialno; + } else { + node->next_closed_file = lru_head; + _nvp_closed_files[lru_head].prev_closed_file = node->index_in_free_list; + lru_head = node->index_in_free_list; + } + + hash_index = serialno % 1024; + + if (inode_to_closed_file[hash_index].index != -1) { + node_to_be_removed = (struct ClosedFiles *)&_nvp_closed_files[inode_to_closed_file[hash_index].index]; + stale_fd = remove_from_lru_list_hash(node_to_be_removed->serialno, 1); + *stale_serialno = node_to_be_removed->serialno; + } + + inode_to_closed_file[hash_index].index = node->index_in_free_list; + } + + LRU_UNLOCK_HEAD_WR(); + + return stale_fd; +} + +/* + * Remove a node from the LRU cache, searching based on inode number + */ +int remove_from_lru_list_hash(ino_t serialno, int lock_held) { + + int hash_index = -1, fd = -1, prev_node_idx = -1, next_node_idx = -1; + struct ClosedFiles *node = NULL, *prev_node = NULL, *next_node = NULL; + int lock_set = 0; + + hash_index = serialno % 1024; + + if (!lock_held) { + LRU_LOCK_HEAD_WR(); + lock_set = 1; + } + + if (inode_to_closed_file[hash_index].index != -1) { + + node = (struct ClosedFiles *)&_nvp_closed_files[inode_to_closed_file[hash_index].index]; + + if (node->serialno == serialno) { + + prev_node_idx = node->prev_closed_file; + next_node_idx = node->next_closed_file; + if (prev_node_idx >= 0) { + prev_node = (struct ClosedFiles *)&_nvp_closed_files[prev_node_idx]; + } + + if (next_node_idx >= 0) { + next_node = (struct ClosedFiles *)&_nvp_closed_files[next_node_idx]; + } + + fd = node->fd; + + if (prev_node_idx != -1) + prev_node->next_closed_file = next_node_idx; + if (next_node_idx != -1) + next_node->prev_closed_file = prev_node_idx; + if (node->index_in_free_list == lru_head) { + lru_head = node->next_closed_file; + _nvp_closed_files[lru_head].prev_closed_file = -1; + } + if (node->index_in_free_list == lru_tail) { + lru_tail = node->prev_closed_file; + lru_tail_serialno = _nvp_closed_files[lru_tail].serialno; + _nvp_closed_files[lru_tail].next_closed_file = -1; + } + + node->prev_closed_file = -1; + node->next_closed_file = -1; + node->fd = -1; + node->serialno = 0; + + if(!lock_held) { + LRU_UNLOCK_HEAD_WR(); + lock_set = 0; + } + + inode_to_closed_file[hash_index].index = -1; + push_in_stack(0, 1, node->index_in_free_list, -1); + } + } + + if(!lock_held && lock_set) { + LRU_UNLOCK_HEAD_WR(); + lock_set = 0; + } + + return fd; +} + +/* + * Remove a node from LRU cache based on LRU policy for background thread + */ +int remove_from_lru_list_policy(ino_t *serialno) { + + int hash_index = -1; + ino_t local_serialno = 0; + int fd = -1; + struct ClosedFiles *node = NULL; + + LRU_LOCK_HEAD_WR(); + + node = (struct ClosedFiles *)&_nvp_closed_files[lru_tail]; + + local_serialno = node->serialno; + hash_index = local_serialno % 1024; + + lru_tail = node->prev_closed_file; + if (lru_tail != -1) { + lru_tail_serialno = _nvp_closed_files[lru_tail].serialno; + _nvp_closed_files[lru_tail].next_closed_file = -1; + } else + lru_tail_serialno = 0; + + fd = node->fd; + *serialno = local_serialno; + + node->next_closed_file = -1; + node->prev_closed_file = -1; + node->fd = -1; + node->serialno = 0; + + 
inode_to_closed_file[hash_index].index = -1; + push_in_stack(0, 1, node->index_in_free_list, -1); + + LRU_UNLOCK_HEAD_WR(); + + return fd; +} + +/* + * Remove a node from LRU cache based on LRU policy for background thread + */ +int remove_from_seq_list(struct ClosedFiles *node, ino_t *serialno) { + + int fd = -1; + + fd = node->fd; + *serialno = node->serialno; + + node->next_closed_file = -1; + node->prev_closed_file = -1; + node->fd = -1; + node->serialno = 0; + + return fd; +} + + +int remove_from_seq_list_hash(struct ClosedFiles *node, ino_t serialno) { + + int fd = -1; + + if(node->serialno == serialno) { + fd = node->fd; + node->next_closed_file = -1; + node->prev_closed_file = -1; + node->fd = -1; + node->serialno = 0; + } + + return fd; +} + diff --git a/splitfs_syscall_intercept/src/lru_cache.h b/splitfs_syscall_intercept/src/lru_cache.h new file mode 100644 index 0000000000..0ad0205f39 --- /dev/null +++ b/splitfs_syscall_intercept/src/lru_cache.h @@ -0,0 +1,75 @@ +#ifndef LEDGER_SRC_LRU_H_ +#define LEDGER_SRC_LRU_H_ + +#include "file.h" +// #include "stack.h" +#include + +/* + * Declare the structure that will hold information of the files that are to be closed + */ +struct ClosedFiles { + NVP_LOCK_DECL; + int fd; + ino_t serialno; + int index_in_free_list; + int next_closed_file; + int prev_closed_file; +}; + +struct InodeClosedFile { + NVP_LOCK_DECL; + int index; +}; + +#define LRU_NODE_LOCKING 1 +#if LRU_NODE_LOCKING + +#define LRU_NODE_LOCK_WR(cnode) NVP_LOCK_WR(cnode->lock) +#define LRU_NODE_UNLOCK_WR(cnode) NVP_LOCK_UNLOCK_WR(cnode->lock) + +#else + +#define LRU_NODE_LOCK_WR(cnode) {(void)(cnode->lock);} +#define LRU_NODE_UNLOCK_WR(cnode) {(void)(cnode->lock);} + +#endif + +#define LRU_HEAD_LOCKING 0 +#if LRU_HEAD_LOCKING + +#define LRU_LOCK_HEAD_WR() {pthread_spin_lock(&global_lock_lru_head);} +#define LRU_UNLOCK_HEAD_WR() {pthread_spin_unlock(&global_lock_lru_head);} + +#else + +#define LRU_LOCK_HEAD_WR() {(void)(global_lock_lru_head);} +#define LRU_UNLOCK_HEAD_WR() {(void)(global_lock_lru_head);} + +#endif + +/* + * Declare the hash table that will map inode number to the node in the LRU list + */ +struct InodeClosedFile *inode_to_closed_file; +atomic_uint_fast64_t dr_mem_closed_files; +atomic_uint_fast64_t dr_mem_allocated; +atomic_uint_fast64_t num_files_closed; +pthread_spinlock_t global_lock_lru_head; + +/* + * Global variables to hold the head and tail of LRU list + */ +struct ClosedFiles *_nvp_closed_files; +int lru_head; +int lru_tail; +int lru_tail_serialno; + +int insert_in_seq_list(struct ClosedFiles *node, ino_t *stale_serialno, int fd, ino_t serialno); +int insert_in_lru_list(int fd, ino_t serialno, ino_t *stale_serialno); +int remove_from_lru_list_hash(ino_t serialno, int lock_held); +int remove_from_lru_list_policy(ino_t *serialno); +int remove_from_seq_list(struct ClosedFiles *node, ino_t *serialno); +int remove_from_seq_list_hash(struct ClosedFiles *node, ino_t serialno); + +#endif diff --git a/splitfs_syscall_intercept/src/make-hs.mk b/splitfs_syscall_intercept/src/make-hs.mk new file mode 100755 index 0000000000..9ce0c0c1f3 --- /dev/null +++ b/splitfs_syscall_intercept/src/make-hs.mk @@ -0,0 +1,329 @@ + + +define \n + + +endef + + + +# nanodd # + +NDD_RUNTIME = 80 + +NDD_TARGET_DIR = /tmp/memuram0 +NDD_TARGET_FILE = xddtestfile.txt +NDD_OUT_DIR = ~/results + +NDDEXEC = $(BEE3HOME)/Workloads/xdd/bin/xdd.linux +NDDFLAGS = -noproclock -nomemlock -reqsize 1 -dio -seek random -id $(NDD_LOG_FILENAME) -runtime $(NDD_RUNTIME) + +NDD_WRAPPER_DIR = 
$(BEE3HOME)/Tools/PosixNVM/bin +NDD_WRAPPER_SCRIPTS = nvpUnmod nvpNVP +NDD_RATIOS = 0 25 50 75 100 +NDD_NUM_THREADS = 1 4 16 64 +NDD_FILE_LENS_KB = 57671680 +NDD_REQ_SIZE = 512 4096 131072 +NDD_LOG_FILENAME = $(SCRIPT)-tc$(THREADS)-rw$(RATIO)-filelen$(FILELEN)k-reqsize$(REQSIZE).result + +NDD_AFFINITY_1 = taskset -c 0,2,4,6 +NDD_AFFINITY_2 = taskset -c 0,2,4,6 +NDD_AFFINITY_4 = taskset -c 0,2,4,6 +NDD_AFFINITY_8 = taskset -c 0,2,4,6,8,10,12,14 +NDD_AFFINITY_16= taskset -c 0-15 +NDD_AFFINITY_32= taskset -c 0-15 +NDD_AFFINITY_64= taskset -c 0-15 +NDD_AFFINITY_12= taskset -c 0-15 +NDD_AFFINITY_24= taskset -c 0-15 + +SHELL = /bin/bash + +NDD_G1_DIR=2012-02-13-01 +NDD_G2_DIR=09 + +test_prep_ramdisk: + -sudo umount /tmp/memuram0 + -sudo rmmod memudisk + sudo insmod $(BEE3HOME)/Tools/KernelModules/Ramdisk/memudisk.ko rd_size=$$[64*1024*1024] max_part=1 rd_nr=1 + sleep 2 + sudo chmod 777 /dev/memuram0 + sudo mke2fs /dev/memuram0 -b 4096 + sudo mount /dev/memuram0 /tmp/memuram0 -o xip + sudo chmod 777 /tmp/memuram0 + -dd if=/dev/zero of=$(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) bs=$$[4096*16] count=$$[59055800320/(4096*16)] oflag=direct + sleep 10 + +test_prep_ramdisk_3: + -sudo umount /tmp/memuram0 + -sudo rmmod brd + sudo insmod /lib/modules/3.2.1-io/kernel/drivers/block/brd.ko rd_size=$$[64*1024*1024] max_part=1 rd_nr=1 + sleep 2 + sudo chmod 777 /dev/ram0 + sudo mke2fs /dev/ram0 -b 4096 + sudo mount /dev/ram0 /tmp/memuram0 -o xip + sudo chmod 777 /tmp/memuram0 + -dd if=/dev/zero of=$(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) bs=$$[4096*16] count=$$[59055800320/(4096*16)] oflag=direct + sleep 10 + +test_prep_ramdisk_hugepage: + -sudo umount /tmp/memuram0 + -sudo umount /tmp/memuram0 + #-sudo rmmod brd + #sudo insmod /lib/modules/3.2.1-io/kernel/drivers/block/brd.ko rd_size=$$[64*1024*1024] max_part=1 rd_nr=1 + #sleep 2 + #sudo chmod 777 /dev/ram0 + #sudo mke2fs /dev/ram0 -b 4096 + #sudo mount /dev/ram0 /tmp/memuram0 -o xip + #sudo chmod 777 /tmp/memuram0 + #-dd if=/dev/zero of=$(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) bs=$$[2*1024*1024] count=$$[59055800320/(2*1024*1024/4)] oflag=direct + #sleep 2 + sudo mount -t hugetlbfs -o rw,pagesize=2M,mode=0777 none /tmp/memuram0 + sudo chmod 777 /tmp/memuram0 + sleep 2 + +break_ramdisk: + /homes/leisner/bee3/Tools/PosixNVM/bin/nvpNVP /homes/leisner/bee3/Workloads/fastdd/fastdd.exe asdf -footKB 57671680 -r 0 -s 512 -tc 32 -file /tmp/memuram0/xddtestfile.txt -rt 60 + +HSG1_CMD=$(NDD_AFFINITY_$(THREADS)) $(NDD_WRAPPER_DIR)/$(SCRIPT) $(NDDEXEC) $(NDDFLAGS) -queuedepth $(THREADS) -kbytes $(FILELEN) -rwratio $(RATIO) -blocksize $(REQSIZE) -seek range $$[($(FILELEN)*1024)/$(REQSIZE)] -target $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) + +HSG1_MEGADD_CMD=$(NDD_AFFINITY_$(THREADS)) $(NDD_WRAPPER_DIR)/$(SCRIPT) ./megadd $(NDD_RUNTIME) $(FILELEN) $(REQSIZE) $(RATIO) + +HSG1_FASTDD_CMD=$(NDD_AFFINITY_$(THREADS)) $(NDD_WRAPPER_DIR)/$(SCRIPT) ${BEE3HOME}/Workloads/fastdd/fastdd.exe asdf -footKB $(FILELEN) -r $(RATIO) -s $(REQSIZE) -tc $(THREADS) -file $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) -rt $(NDD_RUNTIME) + +test_xdd_hsg1: + #megadd test_prep_ramdisk + #$(MAKE) -C $(BEE3HOME)/Workloads/xdd all + $(foreach THREADS, $(NDD_NUM_THREADS), \ + $(foreach RATIO, $(NDD_RATIOS), \ + $(foreach FILELEN, $(NDD_FILE_LENS_KB), \ + $(foreach REQSIZE, $(NDD_REQ_SIZE), \ + $(foreach SCRIPT, $(NDD_WRAPPER_SCRIPTS), \ + echo -n "Running test $@ on host " > $(NDD_OUT_DIR)/$(NDD_G1_DIR)/FS-$(NDD_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/FS-$(NDD_LOG_FILENAME); echo -n "in 
directory " >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/FS-$(NDD_LOG_FILENAME); pwd >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/FS-$(NDD_LOG_FILENAME); ${\n}\ + echo "$(HSG1_FASTDD_CMD)" >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/FS-$(NDD_LOG_FILENAME) ; ${\n} \ + 2>&1 $(HSG1_FASTDD_CMD) >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/FS-$(NDD_LOG_FILENAME); sleep 10 ${\n} \ + ))))) + + +HSG2_CMD=$(NDD_AFFINITY_$(THREADS)) $(NDD_WRAPPER_DIR)/$(SCRIPT) $(NDDEXEC) $(NDDFLAGS) -queuedepth $(THREADS) -kbytes $(FILELEN) -rwratio $(RATIO) -blocksize $(REQSIZE) -seek range $$[($(FILELEN)*1024)/$(REQSIZE)] -target $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) -startoffset 4096 + +HSG2_MEGADD_CMD=$(NDD_AFFINITY_$(THREADS)) $(NDD_WRAPPER_DIR)/$(SCRIPT) ./megadd $(NDD_RUNTIME) $(FILELEN) $(REQSIZE) $(RATIO) + +HSG2_FASTDD_CMD=$(NDD_AFFINITY_$(THREADS)) $(NDD_WRAPPER_DIR)/$(SCRIPT) ${BEE3HOME}/Workloads/fastdd/fastdd.exe asdf -footKB $(FILELEN) -r $(RATIO) -s $(REQSIZE) -tc $(THREADS) -file $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) -rt $(NDD_RUNTIME) + + +test_xdd_hsg2: test_prep_ramdisk + #$(MAKE) -C $(BEE3HOME)/Workloads/xdd all + $(foreach THREADS, $(NDD_NUM_THREADS), \ + $(foreach RATIO, $(NDD_RATIOS), \ + $(foreach FILELEN, $(NDD_FILE_LENS_KB), \ + $(foreach REQSIZE, 4096, \ + $(foreach SCRIPT, $(NDD_WRAPPER_SCRIPTS), \ + echo -n "Running test $@ on host " > $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); echo -n "in directory " >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); pwd >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); ${\n}\ + echo "$(HSG2_FASTDD_CMD)" 2>&1 >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); ${\n}\ + $(HSG2_FASTDD_CMD) 2>&1 >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); sleep 1 ${\n} \ + ))))) + +establish_mem_baseline: + -sudo rmmod memudisk.ko + /homes/leisner/bee3/Workloads/fastmm/fastmm.exe asdf -footKB 57671680 -r 100 -s 4096 -tc 16 -file /dev/memuram5 -rt 90 + /homes/leisner/bee3/Workloads/fastmm/fastmm.exe asdf -footKB 57671680 -r 0 -s 4096 -tc 16 -file /dev/memuram5 -rt 90 + sudo insmod ~leisner/bee3/Tools/KernelModules/Ramdisk/memudisk.ko && sleep 2 + sudo chmod 777 /dev/memuram5 + /homes/leisner/bee3/Workloads/fastdd/fastdd.exe asdf -footKB 57671680 -r 100 -s 4096 -tc 16 -file /dev/memuram5 -rt 90 + sudo rmmod memudisk.ko + sudo insmod ~leisner/bee3/Tools/KernelModules/Ramdisk/memudisk.ko && sleep 2 + sudo chmod 777 /dev/memuram5 + /homes/leisner/bee3/Workloads/fastdd/fastdd.exe asdf -footKB 57671680 -r 0 -s 4096 -tc 16 -file /dev/memuram5 -rt 90 + sudo rmmod memudisk.ko + +sweep_cores: + $(foreach CORE, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15, taskset -c $(CORE) /homes/leisner/bee3/Workloads/fastmm/fastmm.exe asdf -footKB 4194304 -r 100 -s 4096 -tc 1 -file /dev/foochar -rt 10 2>&1 | grep asdf; ) + #$(foreach CORE, 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15, taskset -c $(CORE) /homes/leisner/bee3/Workloads/fastdd/fastdd.exe asdf -footKB 57671680 -r 100 -s 4096 -tc 8 -file /mnt/foodisk/zero.txt -rt 10 2>&1 | grep asdf; ) + +FOO_TARGET_DIR = /mnt/foodisk +FOO_TARGET_FILE = zero.txt +#FOO_TARGET_DIR = /tmp/memuram0 +#FOO_TARGET_FILE = xddtestfile.txt + + +# $(foreach THREADS, 1 2 4 8 12 16 24 32, \ +# $(foreach RATIO, 0 25 50 75 100, \ +# $(foreach FILELEN, 16 256 4096 65536 1048576 16777216 57671680, \ +# $(foreach REQSIZE, 8 128 512 4096 16384 65536 262144 1048576 4194304 16777216 511 513 4191 4193, \ + +# rm $(FOO_TARGET_DIR)/$(FOO_TARGET_FILE) ; \ +# dd if=/dev/zero of=$(FOO_TARGET_DIR)/$(FOO_TARGET_FILE) seek=0 count=1 
bs=1 ; \ + +FOO_OUT_DIR = ~/results +FOO_G1_DIR=2012-06-07-02 +FOO_LOG_FILENAME = $(SCRIPT)-tc$(THREADS)-rw$(RATIO)-filelen$(FILELEN)k-reqsize$(REQSIZE).result + +FOO_WRAPPER_DIR = $(BEE3HOME)/Tools/PosixNVM/bin +FOOEXEC = $(BEE3HOME)/Workloads/xdd/bin/xdd.linux + +FOO_RUNTIME = 40 + +FOO_G1_CMD=$(NDD_AFFINITY_$(THREADS)) $(FOO_WRAPPER_DIR)/$(SCRIPT) ${BEE3HOME}/Workloads/fastdd/fastdd.exe asdf -footKB $(FILELEN) -r $(RATIO) -s $(REQSIZE) -tc $(THREADS) -file $(FOO_TARGET_DIR)/$(FOO_TARGET_FILE) -rt $(FOO_RUNTIME) + +test_foodisk_1: + mkdir $(FOO_OUT_DIR)/$(FOO_G1_DIR) + #$(MAKE) -C foomodule all prepd + @echo `date` ": Start of test $@" >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/log.txt + $(foreach THREADS, 1 2 4 8 12 16, \ + $(foreach RATIO, 0 50 100, \ + $(foreach FILELEN, 256 4096 65536 1048576 16777216 57671680, \ + $(foreach REQSIZE, 8 512 4096 16384, \ + $(foreach SCRIPT, nvpUnmod nvpNVP, \ + echo -n "Running test $@ on host " > $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_LOG_FILENAME); echo -n "in directory " >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_LOG_FILENAME); pwd >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_LOG_FILENAME); ${\n}\ + echo "$(FOO_G1_CMD)" >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_LOG_FILENAME) ; ${\n} \ + @echo `date` ": Start of test " $(FOO_LOG_FILENAME) >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/log.txt ; ${\n} \ + 2>&1 $(FOO_G1_CMD) >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_LOG_FILENAME); sleep 10 ${\n} \ + ))))) + @echo `date` ": End of test $@" >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/log.txt + +FOO_G2_CMD=$(NDD_AFFINITY_$(THREADS)) $(FOO_WRAPPER_DIR)/$(SCRIPT) ${BEE3HOME}/Workloads/fastmeta/fastmeta.exe asdf -footKB $(FILELEN) -r $(RATIO) -s $(REQSIZE) -tc $(THREADS) -file $(FOO_TARGET_DIR)/meta -F $(FILECOUNT) -rt $(FOO_RUNTIME) +FOO_G2_LOG_FILENAME = $(SCRIPT)-tc$(THREADS)-rw$(RATIO)-filelen$(FILELEN)k-reqsize$(REQSIZE)-filecount$(FILECOUNT).result + +test_foodisk_2: + mkdir $(FOO_OUT_DIR)/$(FOO_G1_DIR) + #$(MAKE) -C foomodule all prepd + @echo `date` ": Start of test $@" >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/log.txt + $(foreach THREADS, 1 4 12, \ + $(foreach RATIO, 0 50, \ + $(foreach FILELEN, 57671680, \ + $(foreach FILECOUNT, 1 4 16 64, \ + $(foreach REQSIZE, 8 512 4192 65536 1048576 16777216, \ + $(foreach SCRIPT, nvpUnmod nvpNVP, \ + rm -rf $(FOO_TARGET_DIR)/meta ; mkdir $(FOO_TARGET_DIR)/meta ; \ + echo -n "Running test $@ on host " > $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_G2_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_G2_LOG_FILENAME); echo -n "in directory " >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_G2_LOG_FILENAME); pwd >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_G2_LOG_FILENAME); ${\n}\ + echo "$(FOO_G2_CMD)" >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_G2_LOG_FILENAME) ; ${\n} \ + @echo `date` ": Start of test " $(FOO_G2_LOG_FILENAME) >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/log.txt ; ${\n} \ + 2>&1 $(FOO_G2_CMD) >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/FS-$(FOO_G2_LOG_FILENAME); sleep 10 ${\n} \ + )))))) + @echo `date` ": End of test $@" >> $(FOO_OUT_DIR)/$(FOO_G1_DIR)/log.txt + + +#test_compare_dd: test_prep_ramdisk +# $(MAKE) -C $(BEE3HOME)/Workloads/xdd all +# $(MAKE) -C $(BEE3HOME)/Workloads/fastdd clean all +# $(CC) megadd.c -o megadd $(CFLAGS) +# $(BEE3HOME)/Workloads/fastdd/fastdd.exe asdf -rt 600 -file /tmp/memuram0/xddtestfile.txt -foot $[57671680/1024/1024] -r 100 -s 4096 2&>1 > /x/HotStorage2011-NVP/logs/comparedd/fastdd.result +# 
$(BEE3HOME)/Workloads/xdd/bin/xdd.linux -noproclock -nomemlock -reqsize 1 -dio -seek random -target /tmp/memuram0/xddtestfile.txt -runtime 600 -queuedepth 1 -kbytes 57671680 -rwratio 100 -blocksize 4096 -seek range $[57671680/4096] 2&>1 > /x/HotStorage2011-NVP/logs/comparedd/xdd.result +# ./megadd 2&>1 > /x/HotStorage2011-NVP/logs/comparedd/megadd.result + + +BDB_TARGET_DIR = /tmp/memuram0 + +BDB_WRAPPER_DIR = $(BEE3HOME)/Tools/PosixNVM/bin +BDB_WRAPPER_SCRIPTS = nvpNVP nvpUnmod +BDB_NUM_THREADS = 1 2 4 8 16 +BDB_LOG_FILENAME = BDB-$(STRUCT)-$(SCRIPT)-tc$(THREADS).result + +BDB_TREE_EXEC = $(BEE3HOME)/Workloads/BDB/Multi/BTree.exe +BDB_HASH_EXEC = $(BEE3HOME)/Workloads/BDB/Multi/HashTable.exe + +BDB_STORAGE_STRUCTS = TREE HASH + +BDB_FLAGS = -footMB 16384 -reload -file $(BDB_TARGET_DIR)/$(BDB_$(STRUCT)_DIR) -F -rt 600 + +BDB_TREE_DIR = BTree +BDB_HASH_DIR = HashTable + +BDB_OUT_DIR = /x/FAST2012-NVP/data/bdb/01 + +BDB_CMD = $(BDB_WRAPPER_DIR)/$(SCRIPT) $(BDB_$(STRUCT)_EXEC) bdb-$(STRUCT)-$(SCRIPT) $(BDB_FLAGS) -tc $(THREADS) + +test_bdb_hsg3: test_prep_ramdisk + #$(MAKE) -C$(BEE3HOME)/Workloads/BDB/Multi all + -rm $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) + $(foreach SCRIPT, $(BDB_WRAPPER_SCRIPTS), \ + $(foreach THREADS, $(BDB_NUM_THREADS), \ + $(foreach STRUCT, $(BDB_STORAGE_STRUCTS), \ + rm -rf $(BDB_TARGET_DIR)/$(BDB_$(STRUCT)_DIR) ; \ + cp -r /x/SC2010/benchmark_inputs/bdb/$(BDB_$(STRUCT)_DIR) $(BDB_TARGET_DIR)/$(BDB_$(STRUCT)_DIR) ${\n} \ + echo -n "Running test $@ on host " > $(BDB_OUT_DIR)/$(BDB_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(BDB_OUT_DIR)/$(BDB_LOG_FILENAME); echo -n "in directory " >> $(BDB_OUT_DIR)/$(BDB_LOG_FILENAME); pwd >> $(BDB_OUT_DIR)/$(BDB_LOG_FILENAME); ${\n}\ + echo "$(BDB_CMD)" >> $(BDB_OUT_DIR)/$(BDB_LOG_FILENAME) ; ${\n} \ + $(BDB_CMD) 2&>1 >> $(BDB_OUT_DIR)/$(BDB_LOG_FILENAME) ; \ + sleep 1 ${\n} \ + ))) + + +HSG2_ALT_CMD=$(NDD_AFFINITY) $(NDD_WRAPPER_DIR)/$(SCRIPT) $(NDDEXEC) $(NDDFLAGS) -queuedepth $(THREADS) -kbytes $(FILELEN) -rwratio $(RATIO) -blocksize $(REQSIZE) -seek range $$[($(FILELEN)*1024)/$(REQSIZE)] -target $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) 2>&1 >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME) + +test_xdd_hsg2_alt: + #$(MAKE) -C $(BEE3HOME)/Workloads/xdd all + $(foreach THREADS, 1, \ + $(foreach RATIO, $(NDD_RATIOS), \ + $(foreach FILELEN, $(NDD_FILE_LENS_KB), \ + $(foreach REQSIZE, 4096, \ + $(foreach SCRIPT, $(NDD_WRAPPER_SCRIPTS), \ + -sudo umount /tmp/memuram0; ${\n} \ + -sudo rmmod memudisk; ${\n} \ + sudo insmod $(BEE3HOME)/Tools/KernelModules/Ramdisk/memudisk.ko rd_size=$$[64*1024*1024] max_part=1 rd_nr=1 ; \ + sleep 2 ; ${\n} \ + sudo chmod 777 /dev/memuram0; ${\n} \ + sudo mke2fs /dev/memuram0 -b 4096 ; ${\n} \ + sudo mount /dev/memuram0 /tmp/memuram0 -o xip ; ${\n} \ + sudo chmod 777 /tmp/memuram0 ; ${\n} \ + dd if=/dev/zero of=$(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) bs=$$[1024*1024] count=$$[57671680/1024] oflag=direct; ${\n} \ + rm $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE); ${\n} \ + -dd if=/dev/zero of=$(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) bs=4096 count=$$[$(FILELEN)/4] oflag=direct ; \ + sleep 2 ; ${\n} \ + echo -n "Running test $@ on host " > $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); echo -n "in directory " >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); pwd >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); ${\n}\ + echo "$(HSG2_ALT_CMD)" >> $(NDD_OUT_DIR)/$(NDD_G2_DIR)/$(NDD_LOG_FILENAME); ${\n} \ + $(HSG2_ALT_CMD); 
sleep 10 ${\n} \ + ))))) + +test_xdd_hsg1_dev: test_prep_ramdisk + sudo umount /tmp/memuram0 + $(foreach THREADS, $(NDD_NUM_THREADS), \ + $(foreach RATIO, $(NDD_RATIOS), \ + $(foreach FILELEN, 57671680, \ + $(foreach REQSIZE, $(NDD_REQ_SIZE), \ + $(foreach SCRIPT, $(NDD_WRAPPER_SCRIPTS), \ + echo -n "Running test $@ on host " > $(NDD_OUT_DIR)/$(NDD_G1_DIR)/dev-$(NDD_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/dev-$(NDD_LOG_FILENAME); echo -n "in directory " >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/dev-$(NDD_LOG_FILENAME); pwd >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/dev-$(NDD_LOG_FILENAME); ${\n}\ + $(NDD_AFFINITY) $(NDD_WRAPPER_DIR)/$(SCRIPT) $(NDDEXEC) $(NDDFLAGS) -queuedepth $(THREADS) -kbytes $(FILELEN) -rwratio $(RATIO) -blocksize $(REQSIZE) -seek range $$[($(FILELEN)*1024)/$(REQSIZE)] -target /dev/memuram0 2>&1 >> $(NDD_OUT_DIR)/$(NDD_G1_DIR)/dev-$(NDD_LOG_FILENAME); sleep 1 ${\n} \ + ))))) + +#grep -H "Combined" $(NDD_OUT_DIR)/*.result > $(NDD_OUT_DIR)/summary.result + + + +OLTP_TARGET_DIR = /mnt/foodisk + +OLTP_WRAPPER_DIR = $(BEE3HOME)/Tools/PosixNVM/bin +OLTP_WRAPPER_SCRIPTS = nvpNVP nvpUnmod +OLTP_NUM_THREADS = 1 16 +OLTP_LOG_FILENAME = OLTP-$(SCRIPT)-tc$(THREADS).result + +OLTP_DEFAULTS=${BEE3HOME}/Tools/PosixNVM/oltp-config-foo.cnf + +OLTP_TIME=60 + +OLTP_OUT_DIR = ~/results/2012-05-01-01 + +#OLTP_REDIRECT= 2&>1 >> $(OLTP_OUT_DIR)/$(OLTP_LOG_FILENAME) +OLTP_REDIRECT = + +OLTP_CMD = $(OLTP_WRAPPER_DIR)/$(SCRIPT) $(OLTP_$(STRUCT)_EXEC) bdb-$(STRUCT)-$(SCRIPT) $(OLTP_FLAGS) -tc $(THREADS) + +#test_oltp_hsg3: test_prep_ramdisk +test_oltp_hsg3: + rm -f $(NDD_TARGET_DIR)/$(NDD_TARGET_FILE) + $(foreach SCRIPT, $(OLTP_WRAPPER_SCRIPTS), \ + $(foreach THREADS, $(OLTP_NUM_THREADS), \ + echo -n "Running test $@ on host " > $(OLTP_OUT_DIR)/$(OLTP_LOG_FILENAME); hostname|awk '{ printf "%s ", $$0 }' >> $(OLTP_OUT_DIR)/$(OLTP_LOG_FILENAME); echo -n "in directory " >> $(OLTP_OUT_DIR)/$(OLTP_LOG_FILENAME); pwd >> $(OLTP_OUT_DIR)/$(OLTP_LOG_FILENAME); ${\n}\ + rm -rf $(OLTP_TARGET_DIR)/mysql; mkdir $(OLTP_TARGET_DIR)/mysql; sleep 1; ${\n} \ + mysqld_safe --defaults-file=$(OLTP_DEFAULTS); sleep 1; ${\n} \ + mysql_install_db --basedir=${BEE3HOME}/ext/mysql-5.1.46/install --datadir=$(OLTP_TARGET_DIR)/mysql --defaults-file=$(OLTP_DEFAULTS) $(OLTP_REDIRECT) ; ${\n}\ + cd ${BEE3HOME}/ext/mysql-5.1.46/install; ./bin/mysqld_safe_moneta --defaults-file=$(OLTP_DEFAULTS) --nvp-preloads='$(OLTP_WRAPPER_DIR)/$(SCRIPT)' $(OLTP_REDIRECT) & ${\n}\ + sleep 180; ${\n}\ + mysql -u root < ${BEE3HOME}/Automate/SC2010/scripts/util/oltp_init.sql $(OLTP_REDIRECT); ${\n}\ + $(BEE3HOME)/ext/sysbench-0.4.12/sysbench/sysbench --test=oltp --db-driver=mysql --mysql-table-engine=innodb --mysql-user=root --mysql-password= --mysql-socket=/tmp/mysql.sock --oltp-table-size=32000000 prepare $(OLTP_REDIRECT) ; ${\n}\ + sleep 80 ; ${\n}\ + $(BEE3HOME)/ext/sysbench-0.4.12/install/bin/sysbench --num-threads=$(THREADS) --max-time=$(OLTP_TIME) --max-requests=0 --test=oltp --oltp-table-size=32000000 --db-driver=mysql --mysql-table-engine=innodb --mysql-user=root --mysql-password= --mysql-socket=/tmp/mysql.sock run $(OLTP_REDIRECT) ; ${\n}\ + sleep 10 ; \ + mysqladmin --defaults-file=/tmp/my.cnf -u root shutdown $(OLTP_REDIRECT) ; \ + sleep 10 ; ${\n}\ + )) + diff --git a/splitfs_syscall_intercept/src/mkdir.c b/splitfs_syscall_intercept/src/mkdir.c new file mode 100644 index 0000000000..b556ddb6ed --- /dev/null +++ b/splitfs_syscall_intercept/src/mkdir.c @@ -0,0 +1,93 @@ +#include +#include + +#include "timers.h" 
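The `_sfs_*` handlers in the files below plug into syscall_intercept's single hook entry point; the `RETT_SYSCALL_INTERCEPT` and `INTF_SYSCALL` macros (defined elsewhere in this patch, not shown here) expand to that hook's return convention and argument list. For orientation, a minimal sketch of the dispatch layer such handlers assume -- the hook and handler bodies here are illustrative, while `intercept_hook_point` and `syscall_no_intercept` are the library's actual API:

```c
#include <libsyscall_intercept_hook_point.h>
#include <syscall.h>

/* Returning 0 tells libsyscall_intercept to skip the kernel and hand
 * *result back to the application; non-zero forwards the syscall. */
static int
hook(long syscall_number,
     long arg0, long arg1, long arg2,
     long arg3, long arg4, long arg5,
     long *result)
{
	(void)arg2; (void)arg3; (void)arg4; (void)arg5;

	switch (syscall_number) {
	case SYS_mkdir:
		/* a real handler would also persist an op log entry here */
		*result = syscall_no_intercept(SYS_mkdir, arg0, arg1);
		return 0;	/* handled: do not pass to the kernel */
	default:
		return 1;	/* not ours: let the kernel handle it */
	}
}

/* Runs before main(), since the library is loaded via LD_PRELOAD. */
static __attribute__((constructor)) void
init(void)
{
	intercept_hook_point = hook;
}
```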
+#include "log.h"
+
+RETT_SYSCALL_INTERCEPT _sfs_MKDIR(INTF_SYSCALL)
+{
+	DEBUG_FILE("CALL: %s\n", __func__);
+	instrumentation_type op_log_entry_time;
+
+	char *path;
+	mode_t mode;
+
+	path = (char *)arg0;
+	mode = (mode_t)arg1;
+
+	// Perform the mkdir first; write to the op log below only on success
+	*result = syscall_no_intercept(SYS_mkdir, path, mode);
+	DEBUG_FILE("%s: System call returned %ld. Logging\n", __func__, *result);
+
+#if !POSIX_ENABLED
+	if(*result == 0) {
+		START_TIMING(op_log_entry_t, op_log_entry_time);
+		persist_op_entry(LOG_DIR_CREATE,
+				path,
+				NULL,
+				mode,
+				0);
+		END_TIMING(op_log_entry_t, op_log_entry_time);
+	}
+#endif
+	return RETT_NO_PASS_KERN;
+}
+
+RETT_SYSCALL_INTERCEPT _sfs_MKDIRAT(INTF_SYSCALL) {
+	DEBUG_FILE("CALL: %s\n", __func__);
+	instrumentation_type op_log_entry_time;
+	int dirfd, mode;
+	char *path;
+
+	dirfd = (int)arg0;
+	path = (char *)arg1;
+	mode = (int)arg2;
+
+	*result = syscall_no_intercept(SYS_mkdirat, dirfd, path, mode);
+
+	// Write to op log, only if the mkdirat succeeded
+
+#if !POSIX_ENABLED
+	if (*result != 0)
+		return RETT_NO_PASS_KERN;
+
+	char new_path[256];
+	int path_len = 0;
+	if (dirfd == AT_FDCWD) {
+		if (path[0] != '/') {
+			if (getcwd(new_path, sizeof(new_path)) == NULL)
+				assert(0);
+			path_len = strlen(new_path);
+			new_path[path_len] = '/';
+			new_path[path_len+1] = '\0';
+
+			if (strcat(new_path, path) != new_path)
+				assert(0);
+		} else {
+			if (strcpy(new_path, path) == NULL)
+				assert(0);
+		}
+	} else {
+		char fd_str[256];
+		if (path[0] != '/') {
+			ssize_t link_len;
+			sprintf(fd_str, "/proc/self/fd/%d", dirfd);
+			link_len = readlink(fd_str, new_path, sizeof(new_path) - 1);
+			if (link_len == -1)
+				assert(0);
+			new_path[link_len] = '\0'; // readlink() does not NUL-terminate
+			path_len = (int)link_len;
+			new_path[path_len] = '/';
+			new_path[path_len+1] = '\0';
+			if (strcat(new_path, path) != new_path)
+				assert(0);
+		} else {
+			if (strcpy(new_path, path) == NULL)
+				assert(0);
+		}
+	}
+
+	START_TIMING(op_log_entry_t, op_log_entry_time);
+	persist_op_entry(LOG_DIR_CREATE,
+			new_path,
+			NULL,
+			mode,
+			0);
+	END_TIMING(op_log_entry_t, op_log_entry_time);
+#endif
+	return RETT_NO_PASS_KERN;
+}
diff --git a/splitfs_syscall_intercept/src/mknod.c b/splitfs_syscall_intercept/src/mknod.c
new file mode 100644
index 0000000000..455c7e64cb
--- /dev/null
+++ b/splitfs_syscall_intercept/src/mknod.c
@@ -0,0 +1,95 @@
+#include
+#include
+
+#include "timers.h"
+#include "log.h"
+
+// Makes the call synchronous in case of 'strict' and 'sync' mode
+RETT_SYSCALL_INTERCEPT _sfs_MKNOD(INTF_SYSCALL) {
+	instrumentation_type op_log_entry_time;
+	char *path;
+	mode_t mode;
+	dev_t dev;
+
+	path = (char *)arg0;
+	mode = (mode_t)arg1;
+	dev = (dev_t)arg2;
+
+	*result = syscall_no_intercept(SYS_mknod, path, mode, dev);
+
+#if !POSIX_ENABLED
+	if (S_ISREG(mode)) {
+		START_TIMING(op_log_entry_t, op_log_entry_time);
+		persist_op_entry(LOG_FILE_CREATE,
+				path,
+				NULL,
+				mode,
+				0);
+		END_TIMING(op_log_entry_t, op_log_entry_time);
+	}
+#endif
+
+	return RETT_NO_PASS_KERN;
+}
+
+RETT_SYSCALL_INTERCEPT _sfs_MKNODAT(INTF_SYSCALL) {
+	int dirfd;
+	char *path;
+	mode_t mode;
+	dev_t dev;
+
+	dirfd = (int)arg0;
+	path = (char *)arg1;
+	mode = (mode_t)arg2;
+	dev = (dev_t)arg3;
+
+	*result = syscall_no_intercept(SYS_mknodat, dirfd, path, mode, dev);
+
+	char new_path[256];
+	int path_len = 0;
+	instrumentation_type op_log_entry_time;
+
+	if (S_ISREG(mode)) {
+		if (dirfd == AT_FDCWD) {
+			if (path[0] != '/') {
+				if (getcwd(new_path, sizeof(new_path)) == NULL)
+					assert(0);
+				path_len = strlen(new_path);
+				new_path[path_len] = '/';
+				new_path[path_len+1] = '\0';
+
+				if (strcat(new_path, path) != new_path)
+					assert(0);
+			} else {
+				if (strcpy(new_path, path) == NULL)
+					assert(0);
+			}
+		} else {
+			char fd_str[256];
+			if (path[0] != '/') {
+				ssize_t link_len;
+				sprintf(fd_str, "/proc/self/fd/%d", dirfd);
+				link_len = readlink(fd_str, new_path, sizeof(new_path) - 1);
+				if (link_len == -1)
+					assert(0);
+				new_path[link_len] = '\0'; // readlink() does not NUL-terminate
+				path_len = (int)link_len;
+				new_path[path_len] = '/';
+				new_path[path_len+1] = '\0';
+				if (strcat(new_path, path) != new_path)
+					assert(0);
+			} else {
+				if (strcpy(new_path, path) == NULL)
+					assert(0);
+			}
+		}
+	}
+
+#if !POSIX_ENABLED
+	// new_path is only built for regular files, so only log in that case
+	if (S_ISREG(mode)) {
+		START_TIMING(op_log_entry_t, op_log_entry_time);
+		persist_op_entry(LOG_FILE_CREATE,
+				new_path,
+				NULL,
+				mode,
+				0);
+		END_TIMING(op_log_entry_t, op_log_entry_time);
+	}
+#endif
+	return RETT_NO_PASS_KERN;
+}
\ No newline at end of file
diff --git a/splitfs_syscall_intercept/src/mmap_cache.c b/splitfs_syscall_intercept/src/mmap_cache.c
new file mode 100644
index 0000000000..c67a50569b
--- /dev/null
+++ b/splitfs_syscall_intercept/src/mmap_cache.c
@@ -0,0 +1,125 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  mmap_cache.c
+ *
+ *    Description:
+ *
+ *        Version:  1.0
+ *        Created:  09/25/2019 03:46:02 PM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  YOUR NAME (),
+ *   Organization:
+ *
+ * =====================================================================================
+ */
+#include
+#include "mmap_cache.h"
+#include "handle_mmaps.h"
+
+void nvp_add_to_inode_mapping(struct NVNode *node, ino_t serialno)
+{
+	struct InodeToMapping *mappingToBeAdded;
+
+	int index = serialno % 1024;
+	int i, dirty_index;
+
+	if (serialno == 0)
+		return;
+
+	DEBUG("Cleanup: root %p, height %u\n", node->root, node->height);
+	mappingToBeAdded = &_nvp_ino_mapping[index];
+	if(mappingToBeAdded->serialno != 0 && mappingToBeAdded->serialno != serialno) {
+		// Replacing some mmap() in that global mmap() cache. So must munmap() all the mmap() ranges in that cache.
+		nvp_free_btree(mappingToBeAdded->root, mappingToBeAdded->merkle_root, mappingToBeAdded->height, mappingToBeAdded->root_dirty_cache, mappingToBeAdded->root_dirty_num, mappingToBeAdded->total_dirty_mmaps);
+
+		mappingToBeAdded->serialno = 0;
+	}
+
+	// Check if many mmap()s need to be copied. If total_dirty_mmaps is set, that means all the mmap()s need to be copied.
+	if(node->total_dirty_mmaps) {
+		memcpy(mappingToBeAdded->root, node->root, 1024 * sizeof(unsigned long));
+		memcpy(mappingToBeAdded->merkle_root, node->merkle_root, 1024 * sizeof(unsigned long));
+
+	} else {
+		// Only copy the dirty mmaps. The indexes can be found in the root_dirty_cache.
+		for(i = 0; i < node->root_dirty_num; i++) {
+			dirty_index = node->root_dirty_cache[i];
+			if(node->root && node->root[dirty_index])
+				mappingToBeAdded->root[dirty_index] = node->root[dirty_index];
+
+			if(node->merkle_root && node->merkle_root[dirty_index])
+				mappingToBeAdded->merkle_root[dirty_index] = node->merkle_root[dirty_index];
+		}
+	}
+
+	mappingToBeAdded->serialno = serialno;
+
+	if(node->root_dirty_num)
+		memcpy(mappingToBeAdded->root_dirty_cache, node->root_dirty_cache, 20 * sizeof(unsigned long));
+
+	mappingToBeAdded->root_dirty_num = node->root_dirty_num;
+	mappingToBeAdded->total_dirty_mmaps = node->total_dirty_mmaps;
+	mappingToBeAdded->height = node->height;
+}
+
+/*
+ * This function is responsible for copying all the mapping from the global mmap() cache on to the mmap tree of the node.
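+ * The cache holds MMAP_CACHE_ENTRIES (1024) slots and is indexed by inode
+ * number modulo the table size, so two inodes can land in the same slot;
+ * nvp_add_to_inode_mapping() above evicts (and munmap()s) the older entry
+ * on such a collision, which is why a lookup here can miss.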
+ */
+int nvp_retrieve_inode_mapping(struct NVNode *node) {
+
+	struct InodeToMapping *mappingToBeRetrieved;
+	int index = node->serialno % 1024;
+	int dirty_index, i;
+
+	DEBUG("Cleanup: root %p, height %u\n", node->root, node->height);
+
+	/*
+	 * Get the mapping from the global mmap() cache, based on the inode number of the node whose mapping it should
+	 * be retrieved from.
+	 */
+	mappingToBeRetrieved = &_nvp_ino_mapping[index];
+
+	if(mappingToBeRetrieved->serialno == node->serialno) {
+
+		/*
+		 * Copy the file backed mmap()s and the merkle roots. total_dirty_mmaps suggests that there are more than
+		 * 20 mmaps that need to be copied.
+		 */
+		if(mappingToBeRetrieved->total_dirty_mmaps) {
+			memcpy(node->root, mappingToBeRetrieved->root, 1024 * sizeof(unsigned long));
+			memcpy(node->merkle_root, mappingToBeRetrieved->merkle_root, 1024 * sizeof(unsigned long));
+
+		} else {
+
+			for(i = 0; i < mappingToBeRetrieved->root_dirty_num; i++) {
+				dirty_index = mappingToBeRetrieved->root_dirty_cache[i];
+				if(mappingToBeRetrieved->root && mappingToBeRetrieved->root[dirty_index])
+					node->root[dirty_index] = mappingToBeRetrieved->root[dirty_index];
+
+				if(mappingToBeRetrieved->merkle_root && mappingToBeRetrieved->merkle_root[dirty_index])
+					node->merkle_root[dirty_index] = mappingToBeRetrieved->merkle_root[dirty_index];
+			}
+		}
+
+		// Copy the root_dirty_cache from the global mmap() cache on to the node mmap() cache
+		//if(mappingToBeRetrieved->root_dirty_num)
+		memcpy(node->root_dirty_cache, mappingToBeRetrieved->root_dirty_cache, 20 * sizeof(unsigned long));
+
+		node->root_dirty_num = mappingToBeRetrieved->root_dirty_num;
+		node->total_dirty_mmaps = mappingToBeRetrieved->total_dirty_mmaps;
+		node->height = mappingToBeRetrieved->height;
+
+		//printf("%s: end: node->root[0] = %lu, mapping root = %lu, mapping root dirty num = %d, node->serialno = %lu, index = %d, node reference = %d, thread_id = %lu\n", __func__, node->root[0], mappingToBeRetrieved->root[0], mappingToBeRetrieved->root_dirty_num, node->serialno, index, node->reference, pthread_self());
+
+		goto out;
+	}
+
+	return -1;
+ out:
+	return 0;
+}
+
+
diff --git a/splitfs_syscall_intercept/src/mmap_cache.h b/splitfs_syscall_intercept/src/mmap_cache.h
new file mode 100644
index 0000000000..e4ff0b8f57
--- /dev/null
+++ b/splitfs_syscall_intercept/src/mmap_cache.h
@@ -0,0 +1,43 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  mmap_cache.h
+ *
+ *    Description:
+ *
+ *        Version:  1.0
+ *        Created:  09/25/2019 03:58:38 PM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  YOUR NAME (),
+ *   Organization:
+ *
+ * =====================================================================================
+ */
+#ifndef SPLITFS_MMAP_CACHE_H
+#define SPLITFS_MMAP_CACHE_H
+
+#include
+#include "inode.h"
+
+struct InodeToMapping
+{
+	ino_t serialno;
+	unsigned long *root;
+	unsigned long *merkle_root;
+	unsigned long *root_dirty_cache;
+	int root_dirty_num;
+	int total_dirty_mmaps;
+	unsigned int height;
+	char buffer[16];
+};
+
+extern struct InodeToMapping* _nvp_ino_mapping;
+
+#define MMAP_CACHE_ENTRIES 1024
+
+void nvp_add_to_inode_mapping(struct NVNode *node, ino_t serialno);
+int nvp_retrieve_inode_mapping(struct NVNode *node);
+
+#endif
diff --git a/splitfs_syscall_intercept/src/non_temporal.c b/splitfs_syscall_intercept/src/non_temporal.c
new file mode 100644
index 0000000000..d9c89c5760
--- /dev/null
+++ b/splitfs_syscall_intercept/src/non_temporal.c
@@ -0,0 +1,233 @@
+#include "non_temporal.h"
+#include
"debug.h" + +static size_t Movnt_threshold_granularity = MOVNT_THRESHOLD_GRANULARITY; + +#if 0 +static void +predrain_memory_barrier(void) +{ + _mm_mfence(); /* ensure CLWB or CLFLUSHOPT completes */ +} +#endif + +static void +flush_dcache_invalidate_opt(const void *addr, size_t len) +{ + uintptr_t uptr; + + /* + * Loop through cache-line-size (typically 64B) aligned chunks + * covering the given range. + */ + for (uptr = (uintptr_t)addr & ~(FLUSH_ALIGN - 1); + uptr < (uintptr_t)addr + len; uptr += FLUSH_ALIGN) { + _mm_flush((char *)uptr); + } +} + + +/* + * pmem_flush -- flush processor cache for the given range + */ +static void +pmem_flush(const void *addr, size_t len) +{ + flush_dcache_invalidate_opt(addr, len); +} + + +void *memmove_nodrain_movnt_granularity(void *pmemdest, const void *src, size_t len) +{ + __m128i xmm0, xmm1, xmm2, xmm3; + size_t i; + __m128i *d; + __m128i *s; + void *dest1 = pmemdest; + size_t cnt; + + //predrain_memory_barrier(); + + if (len == 0 || src == pmemdest) + return pmemdest; + + if (len < Movnt_threshold_granularity) { + memmove(pmemdest, src, len); + pmem_flush(pmemdest, len); + return pmemdest; + } + + if ((uintptr_t)dest1 - (uintptr_t)src >= len) { + /* + * Copy the range in the forward direction. + * + * This is the most common, most optimized case, used unless + * the overlap specifically prevents it. + */ + /* copy up to FLUSH_ALIGN boundary */ + cnt = (uint64_t)dest1 & ALIGN_MASK; + if (cnt > 0) { + cnt = FLUSH_ALIGN - cnt; + + /* never try to copy more the len bytes */ + if (cnt > len) + cnt = len; + + uint8_t *d8 = (uint8_t *)dest1; + const uint8_t *s8 = (uint8_t *)src; + for (i = 0; i < cnt; i++) { + *d8 = *s8; + d8++; + s8++; + } + pmem_flush(dest1, cnt); + dest1 = (char *)dest1 + cnt; + src = (char *)src + cnt; + len -= cnt; + } + + d = (__m128i *)dest1; + s = (__m128i *)src; + + cnt = len >> CHUNK_SHIFT_GRANULARITY; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s); + xmm1 = _mm_loadu_si128(s + 1); + xmm2 = _mm_loadu_si128(s + 2); + xmm3 = _mm_loadu_si128(s + 3); + s += 4; + _mm_stream_si128(d, xmm0); + _mm_stream_si128(d + 1, xmm1); + _mm_stream_si128(d + 2, xmm2); + _mm_stream_si128(d + 3, xmm3); + d += 4; + } + + /* copy the tail (<128 bytes) in 16 bytes chunks */ + len &= CHUNK_MASK_GRANULARITY; + if (len != 0) { + cnt = len >> MOVNT_SHIFT; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s); + _mm_stream_si128(d, xmm0); + s++; + d++; + } + } + len &= MOVNT_MASK; + if (len != 0) { + cnt = len >> DWORD_SHIFT; + int32_t *d32 = (int32_t *)d; + int32_t *s32 = (int32_t *)s; + for (i = 0; i < cnt; i++) { + _mm_stream_si32(d32, *s32); + d32++; + s32++; + } + cnt = len & DWORD_MASK; + uint8_t *d8 = (uint8_t *)d32; + const uint8_t *s8 = (uint8_t *)s32; + + for (i = 0; i < cnt; i++) { + *d8 = *s8; + d8++; + s8++; + } + pmem_flush(d32, cnt); + + /* copy the last bytes (<16), first dwords then bytes */ + + } + } else { + /* + * Copy the range in the backward direction. + * + * This prevents overwriting source data due to an + * overlapped destination range. 
+		 */
+
+		dest1 = (char *)dest1 + len;
+		src = (char *)src + len;
+
+		cnt = (uint64_t)dest1 & ALIGN_MASK;
+		if (cnt > 0) {
+			/* never try to copy more than len bytes */
+			if (cnt > len)
+				cnt = len;
+
+			uint8_t *d8 = (uint8_t *)dest1;
+			const uint8_t *s8 = (uint8_t *)src;
+			for (i = 0; i < cnt; i++) {
+				d8--;
+				s8--;
+				*d8 = *s8;
+			}
+			pmem_flush(d8, cnt);
+			dest1 = (char *)dest1 - cnt;
+			src = (char *)src - cnt;
+			len -= cnt;
+		}
+
+		d = (__m128i *)dest1;
+		s = (__m128i *)src;
+
+		cnt = len >> CHUNK_SHIFT_GRANULARITY;
+		for (i = 0; i < cnt; i++) {
+			xmm0 = _mm_loadu_si128(s - 1);
+			xmm1 = _mm_loadu_si128(s - 2);
+			xmm2 = _mm_loadu_si128(s - 3);
+			xmm3 = _mm_loadu_si128(s - 4);
+			s -= 4;
+			_mm_stream_si128(d - 1, xmm0);
+			_mm_stream_si128(d - 2, xmm1);
+			_mm_stream_si128(d - 3, xmm2);
+			_mm_stream_si128(d - 4, xmm3);
+			d -= 4;
+		}
+
+		/* copy the tail (<128 bytes) in 16 bytes chunks */
+		len &= CHUNK_MASK_GRANULARITY;
+		if (len != 0) {
+			cnt = len >> MOVNT_SHIFT;
+			for (i = 0; i < cnt; i++) {
+				d--;
+				s--;
+				xmm0 = _mm_loadu_si128(s);
+				_mm_stream_si128(d, xmm0);
+			}
+		}
+
+		/* copy the last bytes (<16), first dwords then bytes */
+		len &= MOVNT_MASK;
+		if (len != 0) {
+			cnt = len >> DWORD_SHIFT;
+			int32_t *d32 = (int32_t *)d;
+			int32_t *s32 = (int32_t *)s;
+			for (i = 0; i < cnt; i++) {
+				d32--;
+				s32--;
+				_mm_stream_si32(d32, *s32);
+			}
+
+			cnt = len & DWORD_MASK;
+			uint8_t *d8 = (uint8_t *)d32;
+			const uint8_t *s8 = (uint8_t *)s32;
+
+			for (i = 0; i < cnt; i++) {
+				d8--;
+				s8--;
+				*d8 = *s8;
+			}
+			pmem_flush(d8, cnt);
+		}
+	}
+
+	/*
+	 * The call to pmem_*_nodrain() should be followed by pmem_drain()
+	 * to serialize non-temporal store instructions. (It could be only
+	 * one drain after a sequence of pmem_*_nodrain calls).
+	 * However, on platforms that only support strongly-ordered CLFLUSH
+	 * for flushing the CPU cache (or that are forced to not use
+	 * CLWB/CLFLUSHOPT) there is no need to put any memory barrier after
+	 * the flush, so the pmem_drain() is a no-op function. In such case,
+	 * we need to put a memory barrier here.
+ */ + //predrain_memory_barrier(); + + return pmemdest; +} diff --git a/splitfs_syscall_intercept/src/non_temporal.h b/splitfs_syscall_intercept/src/non_temporal.h new file mode 100644 index 0000000000..4438f774ca --- /dev/null +++ b/splitfs_syscall_intercept/src/non_temporal.h @@ -0,0 +1,589 @@ +#ifndef __LEDGER_NON_TEMPORAL_H_ +#define __LEDGER_NON_TEMPORAL_H_ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CHUNK_SIZE_GRANULARITY 64 +#define CHUNK_SIZE 128 /* 16*8 */ +#define CHUNK_SHIFT 7 +#define CHUNK_SHIFT_GRANULARITY 6 +#define CHUNK_MASK (CHUNK_SIZE - 1) +#define CHUNK_MASK_GRANULARITY (CHUNK_SIZE_GRANULARITY - 1) + +#define DWORD_SIZE 4 +#define DWORD_SHIFT 2 +#define DWORD_MASK (DWORD_SIZE - 1) + +#define MOVNT_SIZE 16 +#define MOVNT_MASK (MOVNT_SIZE - 1) +#define MOVNT_SHIFT 4 +#define FLUSH_ALIGN ((uintptr_t)64) +#define ALIGN_MASK (FLUSH_ALIGN - 1) + +#define MOVNT_THRESHOLD 256 +#define MOVNT_THRESHOLD_GRANULARITY 64 + +#define CLFLUSH_SIZE 64 +#define _mm_clflushopt(addr)\ + asm volatile("clflushopt %0" : "+m" (*(volatile char *)(addr))) + +/* This will point to the right function during startup of + splitfs (_nvp_init2) */ +void (*_mm_flush)(void const* p); + + +void *memmove_nodrain_movnt_granularity(void *pmemdest, const void *src, size_t len); + +#if 0 + +static size_t Movnt_threshold = MOVNT_THRESHOLD; + +static void +predrain_memory_barrier(void) +{ + _mm_mfence(); /* ensure CLWB or CLFLUSHOPT completes */ +} + + +static void +flush_dcache_invalidate_opt(const void *addr, size_t len) +{ + uintptr_t uptr; + + for (uptr = (uintptr_t)addr & ~(FLUSH_ALIGN - 1); + uptr < (uintptr_t)addr + len; uptr += FLUSH_ALIGN) { + _mm_clflushopt((char *)uptr); + } +} +*/ + +/* + * pmem_flush -- flush processor cache for the given range + */ + +static void +pmem_flush(const void *addr, size_t len) +{ + flush_dcache_invalidate_opt(addr, len); +} + + +static void * +memmove_nodrain_movnt_granularity(void *pmemdest, const void *src, size_t len) +{ + __m128i xmm0, xmm1, xmm2, xmm3; + size_t i; + __m128i *d; + __m128i *s; + void *dest1 = pmemdest; + size_t cnt; + + //predrain_memory_barrier(); + + if (len == 0 || src == pmemdest) + return pmemdest; + + if (len < Movnt_threshold_granularity) { + memmove(pmemdest, src, len); + pmem_flush(pmemdest, len); + return pmemdest; + } + + if ((uintptr_t)dest1 - (uintptr_t)src >= len) { + /* + * Copy the range in the forward direction. + * + * This is the most common, most optimized case, used unless + * the overlap specifically prevents it. 
+ */ + /* copy up to FLUSH_ALIGN boundary */ + cnt = (uint64_t)dest1 & ALIGN_MASK; + if (cnt > 0) { + cnt = FLUSH_ALIGN - cnt; + + /* never try to copy more the len bytes */ + if (cnt > len) + cnt = len; + + uint8_t *d8 = (uint8_t *)dest1; + const uint8_t *s8 = (uint8_t *)src; + for (i = 0; i < cnt; i++) { + *d8 = *s8; + d8++; + s8++; + } + pmem_flush(dest1, cnt); + dest1 = (char *)dest1 + cnt; + src = (char *)src + cnt; + len -= cnt; + } + + d = (__m128i *)dest1; + s = (__m128i *)src; + + cnt = len >> CHUNK_SHIFT_GRANULARITY; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s); + xmm1 = _mm_loadu_si128(s + 1); + xmm2 = _mm_loadu_si128(s + 2); + xmm3 = _mm_loadu_si128(s + 3); + s += 4; + _mm_stream_si128(d, xmm0); + _mm_stream_si128(d + 1, xmm1); + _mm_stream_si128(d + 2, xmm2); + _mm_stream_si128(d + 3, xmm3); + d += 4; + } + + /* copy the tail (<128 bytes) in 16 bytes chunks */ + len &= CHUNK_MASK_GRANULARITY; + if (len != 0) { + cnt = len >> MOVNT_SHIFT; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s); + _mm_stream_si128(d, xmm0); + s++; + d++; + } + } + len &= MOVNT_MASK; + if (len != 0) { + cnt = len >> DWORD_SHIFT; + int32_t *d32 = (int32_t *)d; + int32_t *s32 = (int32_t *)s; + for (i = 0; i < cnt; i++) { + _mm_stream_si32(d32, *s32); + d32++; + s32++; + } + cnt = len & DWORD_MASK; + uint8_t *d8 = (uint8_t *)d32; + const uint8_t *s8 = (uint8_t *)s32; + + for (i = 0; i < cnt; i++) { + *d8 = *s8; + d8++; + s8++; + } + pmem_flush(d32, cnt); + + /* copy the last bytes (<16), first dwords then bytes */ + + } + } else { + /* + * Copy the range in the backward direction. + * + * This prevents overwriting source data due to an + * overlapped destination range. + */ + + dest1 = (char *)dest1 + len; + src = (char *)src + len; + + cnt = (uint64_t)dest1 & ALIGN_MASK; + if (cnt > 0) { + /* never try to copy more the len bytes */ + if (cnt > len) + cnt = len; + + uint8_t *d8 = (uint8_t *)dest1; + const uint8_t *s8 = (uint8_t *)src; + for (i = 0; i < cnt; i++) { + d8--; + s8--; + *d8 = *s8; + } + pmem_flush(d8, cnt); + dest1 = (char *)dest1 - cnt; + src = (char *)src - cnt; + len -= cnt; + } + + d = (__m128i *)dest1; + s = (__m128i *)src; + + cnt = len >> CHUNK_SHIFT_GRANULARITY; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s - 1); + xmm1 = _mm_loadu_si128(s - 2); + xmm2 = _mm_loadu_si128(s - 3); + _mm_stream_si128(d - 4, xmm3); + d -= 4; + } + + /* copy the tail (<128 bytes) in 16 bytes chunks */ + len &= CHUNK_MASK_GRANULARITY; + if (len != 0) { + cnt = len >> MOVNT_SHIFT; + for (i = 0; i < cnt; i++) { + d--; + s--; + xmm0 = _mm_loadu_si128(s); + _mm_stream_si128(d, xmm0); + } + } + + /* copy the last bytes (<16), first dwords then bytes */ + len &= MOVNT_MASK; + if (len != 0) { + cnt = len >> DWORD_SHIFT; + int32_t *d32 = (int32_t *)d; + int32_t *s32 = (int32_t *)s; + for (i = 0; i < cnt; i++) { + d32--; + s32--; + _mm_stream_si32(d32, *s32); + } + + cnt = len & DWORD_MASK; + uint8_t *d8 = (uint8_t *)d32; + const uint8_t *s8 = (uint8_t *)s32; + + for (i = 0; i < cnt; i++) { + d8--; + s8--; + *d8 = *s8; + } + pmem_flush(d8, cnt); + } + } + + /* + * The call to pmem_*_nodrain() should be followed by pmem_drain() + * to serialize non-temporal store instructions. (It could be only + * one drain after a sequence of pmem_*_nodrain calls). 
+ * However, on platforms that only support strongly-ordered CLFLUSH + * for flushing the CPU cache (or that are forced to not use + * CLWB/CLFLUSHOPT) there is no need to put any memory barrier after + * the flush, so the pmem_drain() is a no-op function. In such case, + * we need to put a memory barrier here. + */ + //predrain_memory_barrier(); + + return pmemdest; +} + +/* + * memmove_nodrain_movnt -- (internal) memmove to pmem without hw drain, movnt + */ +static void * +memmove_nodrain_movnt(void *pmemdest, const void *src, size_t len) +{ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + size_t i; + __m128i *d; + __m128i *s; + void *dest1 = pmemdest; + size_t cnt; + + if (len == 0 || src == pmemdest) + return pmemdest; + + if (len < Movnt_threshold) { + memmove(pmemdest, src, len); + pmem_flush(pmemdest, len); + return pmemdest; + } + + if ((uintptr_t)dest1 - (uintptr_t)src >= len) { + /* + * Copy the range in the forward direction. + * + * This is the most common, most optimized case, used unless + * the overlap specifically prevents it. + */ + /* copy up to FLUSH_ALIGN boundary */ + cnt = (uint64_t)dest1 & ALIGN_MASK; + if (cnt > 0) { + cnt = FLUSH_ALIGN - cnt; + + /* never try to copy more the len bytes */ + if (cnt > len) + cnt = len; + + uint8_t *d8 = (uint8_t *)dest1; + const uint8_t *s8 = (uint8_t *)src; + for (i = 0; i < cnt; i++) { + *d8 = *s8; + d8++; + s8++; + } + pmem_flush(dest1, cnt); + dest1 = (char *)dest1 + cnt; + src = (char *)src + cnt; + len -= cnt; + } + + d = (__m128i *)dest1; + s = (__m128i *)src; + + cnt = len >> CHUNK_SHIFT; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s); + xmm1 = _mm_loadu_si128(s + 1); + xmm2 = _mm_loadu_si128(s + 2); + xmm3 = _mm_loadu_si128(s + 3); + xmm4 = _mm_loadu_si128(s + 4); + xmm5 = _mm_loadu_si128(s + 5); + xmm6 = _mm_loadu_si128(s + 6); + xmm7 = _mm_loadu_si128(s + 7); + s += 8; + _mm_stream_si128(d, xmm0); + _mm_stream_si128(d + 1, xmm1); + _mm_stream_si128(d + 2, xmm2); + _mm_stream_si128(d + 3, xmm3); + _mm_stream_si128(d + 4, xmm4); + _mm_stream_si128(d + 5, xmm5); + _mm_stream_si128(d + 6, xmm6); + _mm_stream_si128(d + 7, xmm7); + d += 8; + } + + /* copy the tail (<128 bytes) in 16 bytes chunks */ + len &= CHUNK_MASK; + if (len != 0) { + cnt = len >> MOVNT_SHIFT; + for (i = 0; i < cnt; i++) { + xmm0 = _mm_loadu_si128(s); + _mm_stream_si128(d, xmm0); + s++; + d++; + } + } + len &= MOVNT_MASK; + if (len != 0) { + cnt = len >> DWORD_SHIFT; + int32_t *d32 = (int32_t *)d; + int32_t *s32 = (int32_t *)s; + for (i = 0; i < cnt; i++) { + _mm_stream_si32(d32, *s32); + d32++; + s32++; + } + cnt = len & DWORD_MASK; + uint8_t *d8 = (uint8_t *)d32; + const uint8_t *s8 = (uint8_t *)s32; + + for (i = 0; i < cnt; i++) { + *d8 = *s8; + d8++; + s8++; + } + pmem_flush(d32, cnt); + + /* copy the last bytes (<16), first dwords then bytes */ + + } + } else { + /* + * Copy the range in the backward direction. + * + * This prevents overwriting source data due to an + * overlapped destination range. 
+		 */
+
+		dest1 = (char *)dest1 + len;
+		src = (char *)src + len;
+
+		cnt = (uint64_t)dest1 & ALIGN_MASK;
+		if (cnt > 0) {
+			/* never try to copy more than len bytes */
+			if (cnt > len)
+				cnt = len;
+
+			uint8_t *d8 = (uint8_t *)dest1;
+			const uint8_t *s8 = (uint8_t *)src;
+			for (i = 0; i < cnt; i++) {
+				d8--;
+				s8--;
+				*d8 = *s8;
+			}
+			pmem_flush(d8, cnt);
+			dest1 = (char *)dest1 - cnt;
+			src = (char *)src - cnt;
+			len -= cnt;
+		}
+
+		d = (__m128i *)dest1;
+		s = (__m128i *)src;
+
+		cnt = len >> CHUNK_SHIFT;
+		for (i = 0; i < cnt; i++) {
+			xmm0 = _mm_loadu_si128(s - 1);
+			xmm1 = _mm_loadu_si128(s - 2);
+			xmm2 = _mm_loadu_si128(s - 3);
+			xmm3 = _mm_loadu_si128(s - 4);
+			xmm4 = _mm_loadu_si128(s - 5);
+			xmm5 = _mm_loadu_si128(s - 6);
+			xmm6 = _mm_loadu_si128(s - 7);
+			xmm7 = _mm_loadu_si128(s - 8);
+			s -= 8;
+			_mm_stream_si128(d - 1, xmm0);
+			_mm_stream_si128(d - 2, xmm1);
+			_mm_stream_si128(d - 3, xmm2);
+			_mm_stream_si128(d - 4, xmm3);
+			_mm_stream_si128(d - 5, xmm4);
+			_mm_stream_si128(d - 6, xmm5);
+			_mm_stream_si128(d - 7, xmm6);
+			_mm_stream_si128(d - 8, xmm7);
+			d -= 8;
+		}
+
+		/* copy the tail (<128 bytes) in 16 bytes chunks */
+		len &= CHUNK_MASK;
+		if (len != 0) {
+			cnt = len >> MOVNT_SHIFT;
+			for (i = 0; i < cnt; i++) {
+				d--;
+				s--;
+				xmm0 = _mm_loadu_si128(s);
+				_mm_stream_si128(d, xmm0);
+			}
+		}
+
+		/* copy the last bytes (<16), first dwords then bytes */
+		len &= MOVNT_MASK;
+		if (len != 0) {
+			cnt = len >> DWORD_SHIFT;
+			int32_t *d32 = (int32_t *)d;
+			int32_t *s32 = (int32_t *)s;
+			for (i = 0; i < cnt; i++) {
+				d32--;
+				s32--;
+				_mm_stream_si32(d32, *s32);
+			}
+
+			cnt = len & DWORD_MASK;
+			uint8_t *d8 = (uint8_t *)d32;
+			const uint8_t *s8 = (uint8_t *)s32;
+
+			for (i = 0; i < cnt; i++) {
+				d8--;
+				s8--;
+				*d8 = *s8;
+			}
+			pmem_flush(d8, cnt);
+		}
+	}
+
+	/*
+	 * The call to pmem_*_nodrain() should be followed by pmem_drain()
+	 * to serialize non-temporal store instructions. (It could be only
+	 * one drain after a sequence of pmem_*_nodrain calls).
+	 * However, on platforms that only support strongly-ordered CLFLUSH
+	 * for flushing the CPU cache (or that are forced to not use
+	 * CLWB/CLFLUSHOPT) there is no need to put any memory barrier after
+	 * the flush, so the pmem_drain() is a no-op function. In such case,
+	 * we need to put a memory barrier here.
+ */ + predrain_memory_barrier(); + + return pmemdest; +} + +/* +unsigned long __copy_user_intel_nocache(void *to, void *from, unsigned long size) { + int d0, d1; + + __asm__ __volatile__( + " .align 2,0x90\n" + "0: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" + " jbe 2f\n" + "1: movl 64(%4), %%eax\n" + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 4(%4), %%edx\n" + " movnti %%eax, 0(%3)\n" + " movnti %%edx, 4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" + " movnti %%eax, 8(%3)\n" + " movnti %%edx, 12(%3)\n" + "4: movl 16(%4), %%eax\n" + "41: movl 20(%4), %%edx\n" + " movnti %%eax, 16(%3)\n" + " movnti %%edx, 20(%3)\n" + "10: movl 24(%4), %%eax\n" + "51: movl 28(%4), %%edx\n" + " movnti %%eax, 24(%3)\n" + " movnti %%edx, 28(%3)\n" + "11: movl 32(%4), %%eax\n" + "61: movl 36(%4), %%edx\n" + " movnti %%eax, 32(%3)\n" + " movnti %%edx, 36(%3)\n" + "12: movl 40(%4), %%eax\n" + "71: movl 44(%4), %%edx\n" + " movnti %%eax, 40(%3)\n" + " movnti %%edx, 44(%3)\n" + "13: movl 48(%4), %%eax\n" + "81: movl 52(%4), %%edx\n" + " movnti %%eax, 48(%3)\n" + " movnti %%edx, 52(%3)\n" + "14: movl 56(%4), %%eax\n" + "91: movl 60(%4), %%edx\n" + " movnti %%eax, 56(%3)\n" + " movnti %%edx, 60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" + " cmpl $63, %0\n" + " ja 0b\n" + " sfence \n" + "5: movl %0, %%eax\n" + " shrl $2, %0\n" + " andl $3, %%eax\n" + " cld\n" + "6: rep; movsl\n" + " movl %%eax,%0\n" + "7: rep; movsb\n" + "8:\n" + ".section .fixup,\"ax\"\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: jmp 8b\n" + ".previous\n" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + _ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + _ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) + : "=&c"(size), "=&D" (d0), "=&S" (d1) + : "1"(to), "2"(from), "0"(size) + : "eax", "edx", "memory"); + return size; +}*/ + +#endif //if 0 +#endif diff --git a/splitfs_syscall_intercept/src/nvp_lock.h b/splitfs_syscall_intercept/src/nvp_lock.h new file mode 100644 index 0000000000..b62a320222 --- /dev/null +++ b/splitfs_syscall_intercept/src/nvp_lock.h @@ -0,0 +1,224 @@ +#ifndef __LEDGER_NVP_LOCK_H_ +#define __LEDGER_NVP_LOCK_H_ + +#include +#include + + +pthread_spinlock_t global_lock_closed_files; +pthread_spinlock_t global_lock; + +// This file describes a custom type of RW locks which are very fast in the read case +// but very slow in the write case. +// One lock exists per logical core. On read, a processor simply gets a rdlock on +// the lock with its number. On write, a processor must sequentially acquire a +// wrlock on every processor. 
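The same asymmetric scheme can be sketched with plain pthread read-write locks, independent of the `NVP_*` macros defined below (all names here are illustrative; the alignment stands in for the cache-line spacing used by the real macros):

```c
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

#define NLOCKS 16	/* should be >= the number of logical cores */

/* One rwlock per core, each on its own pair of cache lines so that
 * readers on different cores never bounce the same line. */
struct percpu_rwlock {
	struct { pthread_rwlock_t l; } __attribute__((aligned(128))) slot[NLOCKS];
};

static void percpu_init(struct percpu_rwlock *pl)
{
	for (int i = 0; i < NLOCKS; i++)
		pthread_rwlock_init(&pl->slot[i].l, NULL);
}

/* Read side: touch only this core's lock -- cheap, no sharing. The
 * caller keeps the returned slot so that unlock hits the same lock
 * even if the thread migrates in between. */
static int percpu_rdlock(struct percpu_rwlock *pl)
{
	int cpu = sched_getcpu() % NLOCKS;
	pthread_rwlock_rdlock(&pl->slot[cpu].l);
	return cpu;
}

static void percpu_rdunlock(struct percpu_rwlock *pl, int cpu)
{
	pthread_rwlock_unlock(&pl->slot[cpu].l);
}

/* Write side: acquire every per-core lock in order -- slow, but
 * writers are rare in the workloads this lock is built for. */
static void percpu_wrlock(struct percpu_rwlock *pl)
{
	for (int i = 0; i < NLOCKS; i++)
		pthread_rwlock_wrlock(&pl->slot[i].l);
}

static void percpu_wrunlock(struct percpu_rwlock *pl)
{
	for (int i = 0; i < NLOCKS; i++)
		pthread_rwlock_unlock(&pl->slot[i].l);
}
```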
+ +#define cpuid(func,ax,bx,cx,dx)\ + __asm__ __volatile__ ("cpuid":\ + "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func)); + +static inline int _nvp_get_cpuid(void) { + uint32_t eax=0; // output: eax + uint32_t ebx=0; // output: ebx + uint32_t ecx=1; // output: ecx + uint32_t edx=0; // output: edx + + cpuid(0x0B, eax,ebx,ecx,edx); + + int id = (int) (((int)(edx & 1) << 3) + ((int)(edx >> 4) & 1) + (int)(edx & 0xe)); + + return id; +} + +static inline int return_zero(void) +{ + return 0; +} + +#ifndef USE_PTHREAD_LOCK + #define USE_PTHREAD_LOCK 1 +#endif + +#ifndef USE_SCHED_GETCPU + #define USE_SCHED_GETCPU 1 +#endif + +#if USE_SINGLE_LOCK + #define GET_CPUID return_zero +#elif USE_SCHED_GETCPU + #define GET_CPUID sched_getcpu +#else + #define GET_CPUID _nvp_get_cpuid +#endif + +#if USE_SINGLE_LOCK + #define NVP_NUM_LOCKS 2 +#else + #define NVP_NUM_LOCKS 16 +#endif + +// double the number of logical cores: each lock takes up half a cache line, so to +// reduce contention we space them out across cache lines. +#if USE_PTHREAD_LOCK + +#define NVP_LOCK_DECL pthread_rwlock_t lock[NVP_NUM_LOCKS] + +#define NVP_LOCK_INIT(lock) { int iter; for(iter=0; iter= WR_HELD) \ +__sync_fetch_and_sub(&lock[cpuid * 2 * 16], 1); \ +DEBUG("NVP_RDLOCK acquired on CPU %i, lock %p\n", cpuid, &lock) + +#define NVP_LOCK_UNLOCK_RD(lock, cpuid) \ + DEBUG("NVP_RDLOCK releasing on CPU %i, lock %p\n", cpuid, &lock); \ +SANITY(cpuid<(NVP_NUM_LOCKS/2)); \ +__sync_fetch_and_sub(&lock[cpuid * 2 * 16], 1); \ +DEBUG("NVP_RDLOCK released on CPU %i, lock %p\n", cpuid, &lock) + + +#define NVP_LOCK_WR(lock) { int iter; \ + DEBUG("NVP_WRLOCK requested on cpu %i, lock %p\n", GET_CPUID(), &lock); \ + for(iter=0; iterlock, cpuid) +#define NVP_UNLOCK_HASH_TABLE_RD(tbl, cpuid) NVP_LOCK_UNLOCK_RD(tbl->lock, cpuid) +#define NVP_LOCK_HASH_TABLE_WR(tbl) NVP_LOCK_WR(tbl->lock) +#define NVP_UNLOCK_HASH_TABLE_WR(tbl) NVP_LOCK_UNLOCK_WR(tbl->lock) + +#else + +#define NVP_LOCK_HASH_TABLE_RD(tbl, cpuid) {(void)(cpuid);} +#define NVP_UNLOCK_HASH_TABLE_RD(tbl, cpuid) {(void)(cpuid);} +#define NVP_LOCK_HASH_TABLE_WR(tbl) {(void)(tbl->lock);} +#define NVP_UNLOCK_HASH_TABLE_WR(tbl) {(void)(tbl->lock);} + +#endif diff --git a/splitfs_syscall_intercept/src/nvp_printf.c b/splitfs_syscall_intercept/src/nvp_printf.c new file mode 100644 index 0000000000..538d21b5af --- /dev/null +++ b/splitfs_syscall_intercept/src/nvp_printf.c @@ -0,0 +1,390 @@ +/*---------------------------------------------------*/ +/* Modified from : */ +/* Public Domain version of printf */ +/* Rud Merriam, Compsult, Inc. Houston, Tx. 
*/ +/* For Embedded Systems Programming, 1991 */ +/* */ +/*---------------------------------------------------*/ + +#ifndef __NVP_XIL_PRINTF_C_ +#define __NVP_XIL_PRINTF_C_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +FILE * _xil_printf_file; + +FILE* _nvp_print_fd; + + +void printString(char *s); + + +static pthread_mutex_t __debug_mutex = PTHREAD_MUTEX_INITIALIZER; + +int max_len = 0; +int pos = 0; +char * outbuf = NULL; + +static void crush_outbuf(void) { + memset(outbuf, '\0', max_len); + pos=0; +} + +static void flush_outbuf(void) +{ + static size_t (*glibc_fwrite) ( const void * ptr, size_t size, size_t count, FILE * stream ) = NULL; + + if(glibc_fwrite==NULL) { + //void* libc_so = dlopen("/lib64/libc.so.6", RTLD_LAZY|RTLD_LOCAL); + void *libc_so = dlopen("/lib/x86_64-linux-gnu/libc.so.6", RTLD_LAZY|RTLD_LOCAL); + if(!libc_so) { assert(0); } + void* glcw = dlsym(libc_so, "fwrite"); + if(!glcw) { assert(0); } + glibc_fwrite = (size_t (*) ( const void * ptr, size_t size, size_t count, FILE * stream ))glcw; + assert(glibc_fwrite!=NULL); + } + int errno_holder = errno; + int ret; + int ret_count = 0; + do { + ret = glibc_fwrite( outbuf, pos, 1, _xil_printf_file); + ret_count++; + } while((ret!=1) && (ret_count < 20)); + + if(ret!=1) { + //xil_printf(stderr, "\n\nERROR: nvp_printf.c: glibc_fwrite returned %i, expected 1: %s\n\n", ret, strerror(errno)); fflush(stderr); + printString("ERROR: nvp_printf.c: glibc_fwrite returned something other than 1!\n"); + while(ret != 1) {} + assert(0); + } + errno = errno_holder; + fflush(_xil_printf_file); + pos=0; +} + +void outbyte (char c) +{ + if(pos >= max_len-1) { + max_len *= 2; + outbuf = (char*) realloc(outbuf, max_len); + memset(outbuf+pos, '\0', max_len-pos); + } + + outbuf[pos] = c; + pos++; +} + +void printString(char *s){ + int i = 0; + while(s[i]) { + outbyte(s[i]); + i++; + } +} +/*----------------------------------------------------*/ +/* Use the following parameter passing structure to */ +/* make xil_printf re-entrant. */ +/*----------------------------------------------------*/ +typedef struct params_s { + int len; + int num1; + int num2; + char pad_character; + int do_padding; + int left_flag; +} params_t; + +/*---------------------------------------------------*/ +/* The purpose of this routine is to output data the */ +/* same as the standard printf function without the */ +/* overhead most run-time libraries involve. Usually */ +/* the printf brings in many kilobytes of code and */ +/* that is unacceptable in most embedded systems. */ +/*---------------------------------------------------*/ + +typedef int (*func_ptr)(int c); + +/*---------------------------------------------------*/ +/* */ +/* This routine puts pad characters into the output */ +/* buffer. */ +/* */ +static void padding( const int l_flag, params_t *par) +{ + int i; + + if (par->do_padding && l_flag && (par->len < par->num1)) + for (i=par->len; inum1; i++) + outbyte( par->pad_character); +} + +/*---------------------------------------------------*/ +/* */ +/* This routine moves a string to the output buffer */ +/* as directed by the padding and positioning flags. 
*/ +/* */ +static void outs(charptr lp, params_t *par) +{ + /* pad on left if needed */ + if(lp == NULL) { lp = "(null)"; } + par->len = strlen( lp); + padding( !(par->left_flag), par); + + /* Move string to the buffer */ + while (*lp && (par->num2)--) + outbyte( *lp++); + + /* Pad on right if needed */ + /* CR 439175 - elided next stmt. Seemed bogus. */ + /* par->len = strlen( lp); */ + padding( par->left_flag, par); +} + +/*---------------------------------------------------*/ +/* */ +/* This routine moves a number to the output buffer */ +/* as directed by the padding and positioning flags. */ +/* */ + +static void outnum( const long long n, const long long base, params_t *par) +{ + charptr cp; + int negative; + char outbuf[32]; + const char digits[] = "0123456789ABCDEF"; + unsigned long long num; + + /* Check if number is negative */ + if (base == 10 && n < 0L) { + negative = 1; + num = -(n); + } + else{ + num = (n); + negative = 0; + } + + /* Build number (backwards) in outbuf */ + cp = outbuf; + do { + *cp++ = digits[(int)(num % base)]; + } while ((num /= base) > 0); + if (negative) + *cp++ = '-'; + *cp-- = 0; + + /* Move the converted number to the buffer and */ + /* add in the padding where needed. */ + par->len = strlen(outbuf); + padding( !(par->left_flag), par); + while (cp >= outbuf) + outbyte( *cp--); + padding( par->left_flag, par); +} + +/*---------------------------------------------------*/ +/* */ +/* This routine gets a number from the format */ +/* string. */ +/* */ +static int getnum( charptr* linep) +{ + int n; + charptr cp; + + n = 0; + cp = *linep; + while (isdigit(*cp)) + n = n*10 + ((*cp++) - '0'); + *linep = cp; + return(n); +} + +/*---------------------------------------------------*/ +/* */ +/* This routine operates just like a printf/sprintf */ +/* routine. It outputs a set of data under the */ +/* control of a formatting string. Not all of the */ +/* standard C format control are supported. The ones */ +/* provided are primarily those needed for embedded */ +/* systems work. Primarily the floaing point */ +/* routines are omitted. Other formats could be */ +/* added easily by following the examples shown for */ +/* the supported formats. */ +/* */ + +/* void esp_printf( const func_ptr f_ptr, + const charptr ctrl1, ...) */ +void xil_printf( FILE * file, const charptr ctrl1, ...) +{ + pthread_mutex_lock(&__debug_mutex); + + _xil_printf_file = file; + + + int long_flag; + int dot_flag; + + params_t par; + + char ch; + va_list argp; + charptr ctrl = ctrl1; + + va_start( argp, ctrl1); + + + if(outbuf == NULL) { + max_len = 512; + outbuf = (char*) calloc(max_len, sizeof(char)); + } + + crush_outbuf(); + + for ( ; *ctrl; ctrl++) { + + /* move format string chars to buffer until a */ + /* format control is found. */ + if (*ctrl != '%') { + outbyte(*ctrl); + continue; + } + + /* initialize all the flags for this format. 
*/ + dot_flag = long_flag = par.left_flag = par.do_padding = 0; + par.pad_character = ' '; + par.num2=32767; + + try_next: + ch = *(++ctrl); + + if (isdigit(ch)) { + if (dot_flag) + par.num2 = getnum(&ctrl); + else { + if (ch == '0') + par.pad_character = '0'; + + par.num1 = getnum(&ctrl); + par.do_padding = 1; + } + ctrl--; + goto try_next; + } + + switch (tolower(ch)) { + case '%': + outbyte( '%'); + continue; + + case '-': + par.left_flag = 1; + break; + + case '.': + dot_flag = 1; + break; + + case 'l': + long_flag = 1; + break; + + case 'i': + case 'd': + case 'u': + if (long_flag || ch == 'D') { + outnum( va_arg(argp, long), 10L, &par); + continue; + } + else { + outnum( va_arg(argp, int), 10L, &par); + continue; + } + case 'p': + outbyte('0'); + outbyte('x'); + case 'x': + outnum((long long)va_arg(argp, long long), 16L, &par); + continue; + + case 's': + outs( va_arg( argp, charptr), &par); + continue; + + case 'c': + outbyte( va_arg( argp, int)); + continue; + + case '\\': + switch (*ctrl) { + case 'a': + outbyte( 0x07); + break; + case 'h': + outbyte( 0x08); + break; + case 'r': + outbyte( 0x0D); + break; + case 'n': + outbyte( 0x0D); + outbyte( 0x0A); + break; + default: + outbyte( *ctrl); + break; + } + ctrl++; + break; + + case '\0': + continue; + + default: + printString("Illegal format character: '"); + char t[2]; + t[1] = 0; + t[0] = ch; + printString(t); + printString("'. Good bye.\n"); + assert(0); // this code sucks + continue; + + } + goto try_next; + } + va_end( argp); + + flush_outbuf(); + + pthread_mutex_unlock(&__debug_mutex); + +} + +/*---------------------------------------------------*/ + + +#endif + +#define MK_STR(x) #x + +#ifndef _NVP_PRINT_ERROR_NAME_FDEF_ +#define _NVP_PRINT_ERROR_NAME_FDEF_ +void _nvp_print_error_name(int errnoin) +{ +BOOST_PP_LIST_FOR_EACH(ERROR_IF_PRINT, errnoin, ERROR_NAMES_LIST) +} +#endif + diff --git a/splitfs_syscall_intercept/src/read.c b/splitfs_syscall_intercept/src/read.c new file mode 100644 index 0000000000..7b1c4fbd2a --- /dev/null +++ b/splitfs_syscall_intercept/src/read.c @@ -0,0 +1,494 @@ +/* + * ===================================================================================== + * + * Filename: read.c + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:17:00 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +// required for sched_getcpu (GET_CPUID) +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif +#include +#include +#include + +#include +#include "timers.h" +#include "handle_mmaps.h" +#include "tbl_mmaps.h" +#include "nvp_lock.h" +#include "file.h" +#include "inode.h" +#include "staging.h" +#include "log.h" +#include "fsync.h" +#include "add_delay.h" + +static ssize_t _nvp_read_beyond_true_length(int file, void *buf, size_t count, off_t offset, int wr_lock, int cpuid, + struct NVFile *nvf, struct NVTable_maps *tbl_app, struct NVTable_maps *tbl_over) +{ + size_t len_to_read, extent_length, read_count; + unsigned long mmap_addr; + off_t read_offset_beyond_true_length, offset_within_mmap; + instrumentation_type copy_appendread_time, get_dr_mmap_time; + + num_anon_read++; + + //printf("%s: here beyond true length\n", __func__); + read_count = 0; + len_to_read = count; + + read_offset_beyond_true_length = offset - nvf->node->true_length; + + while (len_to_read > 0) { + +#if DATA_JOURNALING_ENABLED + + read_tbl_mmap_entry(nvf->node, + offset, + len_to_read, + &mmap_addr, + 
&extent_length, + 1); + if (mmap_addr == 0) { + START_TIMING(get_dr_mmap_t, get_dr_mmap_time); + nvp_get_dr_mmap_address(nvf, read_offset_beyond_true_length, + len_to_read, read_count, + &mmap_addr, &offset_within_mmap, &extent_length, + wr_lock, cpuid, 0, tbl_app, tbl_over); + END_TIMING(get_dr_mmap_t, get_dr_mmap_time); + } + +#else // DATA_JOURNALING_ENABLED + + START_TIMING(get_dr_mmap_t, get_dr_mmap_time); + nvp_get_dr_mmap_address(nvf, read_offset_beyond_true_length, + len_to_read, read_count, + &mmap_addr, &offset_within_mmap, &extent_length, + wr_lock, cpuid, 0, tbl_app, tbl_over); + END_TIMING(get_dr_mmap_t, get_dr_mmap_time); + +#endif // DATA_JOURNALING_ENABLED + + if(extent_length > len_to_read) + extent_length = len_to_read; + + START_TIMING(copy_appendread_t, copy_appendread_time); + if(FSYNC_MEMCPY(buf, (char *)mmap_addr, extent_length) != buf) { + MSG("%s: memcpy read failed\n", __func__); + assert(0); + } + +#if NVM_DELAY + perfmodel_add_delay(1, extent_length); +#endif + END_TIMING(copy_appendread_t, copy_appendread_time); + num_memcpy_read++; + memcpy_read_size += extent_length; + read_offset_beyond_true_length += extent_length; + read_count += extent_length; + buf += extent_length; + len_to_read -= extent_length; + anon_read_size += extent_length; + } + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return read_count; +} + +ssize_t read_from_file_mmap(int file, + off_t read_offset_within_true_length, + size_t len_to_read_within_true_length, + int wr_lock, + int cpuid, + void *buf, + struct NVFile *nvf, + struct NVTable_maps *tbl_app, + struct NVTable_maps *tbl_over) +{ + int ret = 0, ret_get_addr = 0; + unsigned long mmap_addr = 0, bitmap_root = 0; + off_t offset_within_mmap = 0; + size_t extent_length = 0, read_count = 0, posix_read = 0; + instrumentation_type copy_overread_time, get_mmap_time; + + START_TIMING(get_mmap_t, get_mmap_time); + ret = nvp_get_mmap_address(nvf, + read_offset_within_true_length, + read_count, + &mmap_addr, + &bitmap_root, + &offset_within_mmap, + &extent_length, + wr_lock, + cpuid, + tbl_app, + tbl_over); + END_TIMING(get_mmap_t, get_mmap_time); + + switch (ret) { + case 0: // Mmaped. Do memcpy. + break; + case 1: // Not mmaped. Calling Posix pread. 
+ posix_read = syscall_no_intercept(SYS_pread64, file, + buf, len_to_read_within_true_length, + read_offset_within_true_length); + num_posix_read++; + posix_read_size += posix_read; + + return posix_read; + default: + break; + } + + if (extent_length > len_to_read_within_true_length) + extent_length = len_to_read_within_true_length; + + START_TIMING(copy_overread_t, copy_overread_time); + DEBUG_FILE("%s: Reading from addr = %p, offset = %lu, size = %lu\n", __func__, (void *) mmap_addr, offset_within_mmap, extent_length); + if(FSYNC_MEMCPY(buf, (const void * restrict)mmap_addr, extent_length) != buf) { + printf("%s: memcpy read failed\n", __func__); + fflush(NULL); + assert(0); + } + +#if NVM_DELAY + perfmodel_add_delay(1, extent_length); +#endif + + END_TIMING(copy_overread_t, copy_overread_time); + + num_memcpy_read++; + memcpy_read_size += extent_length; + + return extent_length; +} + +static ssize_t _nvp_do_pread(int file, void *buf, size_t count, off_t offset, int wr_lock, int cpuid, struct NVFile *nvf, struct NVTable_maps *tbl_app, struct NVTable_maps *tbl_over) +{ + SANITYCHECKNVF(nvf); + long long read_offset_within_true_length = 0; + size_t read_count, extent_length, read_count_beyond_true_length; + size_t len_to_read_within_true_length; + size_t posix_read = 0; + unsigned long mmap_addr = 0; + unsigned long bitmap_root = 0; + off_t offset_within_mmap; + ssize_t available_length = (nvf->node->length) - offset; + instrumentation_type copy_overread_time, read_tbl_mmap_time; + + if (UNLIKELY(!nvf->canRead)) { + DEBUG("FD not open for reading: %i\n", file); + errno = EBADF; + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + + return -1; + } + + else if (UNLIKELY(offset < 0)) + { + DEBUG("Requested read at negative offset (%li)\n", offset); + errno = EINVAL; + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return -1; + } + + if(nvf->aligned) + { + DEBUG("This read must be aligned. Checking alignment.\n"); + + if(UNLIKELY(available_length <= 0)) + { + DEBUG("Actually there weren't any bytes available " + "to read. Bye! (length %li, offset %li, " + "available_length %li)\n", nvf->node->length, + offset, available_length); + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return 0; + } + + if(UNLIKELY(count % 512)) + { + DEBUG("cout is not aligned to 512 (count was %i)\n", + count); + + errno = EINVAL; + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return -1; + } + if(UNLIKELY(offset % 512)) + { + DEBUG("offset was not aligned to 512 (offset was %i)\n", + offset); + + errno = EINVAL; + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return -1; + } + if(UNLIKELY(((long long int)buf & (512-1)) != 0)) + { + DEBUG("buffer was not aligned to 512 (buffer was %p, " + "mod 512=%i)\n", buf, (long long int)buf % 512); + errno = EINVAL; + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return -1; + } + } + + ssize_t len_to_read = count; + if (count > available_length) + { + len_to_read = available_length; + DEBUG("Request read length was %li, but only %li bytes " + "available. (filelen = %li, offset = %li, " + "requested %li)\n", count, len_to_read, + nvf->node->length, offset, count); + } + + if(UNLIKELY( (len_to_read <= 0) || (available_length <= 0) )) + { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + return 0; // reading 0 bytes is easy! 
+ } + + DEBUG("mmap is length %li, len_to_read is %li\n", nvf->node->maplength, + len_to_read); + + SANITYCHECK(len_to_read + offset <= nvf->node->length); + + read_count = 0; + + /* + * if data to be read <= true_length_for_read, then it can be read from file backed mmap. Otherwise, it can be + * read from anonymous mmap + * len_to_read_within_true_length = amount of data that can be read using file backed mmap. + */ + read_offset_within_true_length = (offset > nvf->node->true_length) ? -1 : offset; + + if(read_offset_within_true_length == -1) + len_to_read_within_true_length = 0; + else { + len_to_read_within_true_length = (len_to_read + offset > nvf->node->true_length) ? nvf->node->true_length - offset : len_to_read; + } + + DEBUG_FILE("%s: len of read request = %lu, offset = %lu. True Size of file = %lu. Fake file size = %lu\n", __func__, len_to_read_within_true_length, read_offset_within_true_length, nvf->node->true_length, nvf->node->length); + + while (len_to_read_within_true_length > 0) { + // Get the file backed mmap address from which the read is to be performed. + + START_TIMING(read_tbl_mmap_t, read_tbl_mmap_time); + read_tbl_mmap_entry(nvf->node, + read_offset_within_true_length, + len_to_read_within_true_length, + &mmap_addr, + &extent_length, + 1); + END_TIMING(read_tbl_mmap_t, read_tbl_mmap_time); + + DEBUG_FILE("%s: addr to read = %p, size to read = %lu. Inode = %lu\n", __func__, mmap_addr, extent_length, nvf->node->serialno); + DEBUG("Pread: get_mmap_address returned %d, length %llu\n", + ret, extent_length); + + if (mmap_addr == 0) { + extent_length = read_from_file_mmap(file, + read_offset_within_true_length, + len_to_read_within_true_length, + wr_lock, + cpuid, + buf, + nvf, + tbl_app, + tbl_over); + goto post_read; + + } + + DEBUG_FILE("%s: memcpy args: buf = %p, mmap_addr = %p, length = %lu. File off = %lld. Inode = %lu\n", __func__, buf, (void *) mmap_addr, extent_length, read_offset_within_true_length, nvf->node->serialno); + START_TIMING(copy_overread_t, copy_overread_time); + if(FSYNC_MEMCPY(buf, + (void *)mmap_addr, + extent_length) != buf) { + printf("%s: memcpy read failed\n", __func__); + fflush(NULL); + assert(0); + } +#if NVM_DELAY + perfmodel_add_delay(1, extent_length); +#endif + END_TIMING(copy_overread_t, copy_overread_time); + // Add the NVM read latency + + num_memcpy_read++; + memcpy_read_size += extent_length; + post_read: + len_to_read -= extent_length; + len_to_read_within_true_length -= extent_length; + read_offset_within_true_length += extent_length; + read_count += extent_length; + buf += extent_length; + offset += extent_length; + } + + if(!len_to_read) { + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + DEBUG_FILE("%s: Returning from read over. Size = %lu\n", __func__, read_count); + return read_count; + } + + // If we need to read from anonymous memory, call _nvp_read_beyond_true_length + read_count_beyond_true_length = _nvp_read_beyond_true_length(file, + buf, + len_to_read, + offset, + wr_lock, + cpuid, + nvf, + tbl_app, + tbl_over); + read_count += read_count_beyond_true_length; + + DEBUG_FILE("%s: Returning from read beyond. Size = %lu\n", __func__, read_count); + return read_count; +} + +static ssize_t _nvp_check_read_size_valid(size_t count) +{ + if(count == 0) + { + DEBUG("Requested a read of 0 length. 
No problem\n");
+		return 0;
+	}
+	else if((ssize_t)count < 0)
+	{
+		DEBUG("Requested read of more than SSIZE_MAX bytes (%li)\n", count);
+		errno = EINVAL;
+		return -1;
+	}
+
+	return count;
+}
+
+RETT_SYSCALL_INTERCEPT _sfs_READ(INTF_SYSCALL)
+{
+	int file;
+	size_t length;
+	char *buf;
+	instrumentation_type read_time;
+
+	file = (int)arg0;
+	buf = (char *)arg1;
+	length = (size_t)arg2;
+
+	DEBUG_FILE("%s %d\n", __func__, file);
+	num_read++;
+
+	if(!_fd_intercept_lookup[file]) {
+		return RETT_PASS_KERN;
+	}
+
+	int res;
+
+	START_TIMING(read_t, read_time);
+	GLOBAL_LOCK_WR();
+
+	struct NVFile* nvf = &_nvp_fd_lookup[file];
+	struct NVTable_maps *tbl_app = &_nvp_tbl_mmaps[nvf->node->serialno % APPEND_TBL_MAX];
+
+#if DATA_JOURNALING_ENABLED
+	struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[nvf->node->serialno % OVER_TBL_MAX];
+#else
+	struct NVTable_maps *tbl_over = NULL;
+#endif // DATA_JOURNALING_ENABLED
+
+	if(nvf->posix) {
+		DEBUG("Call posix READ for fd %d\n", nvf->fd);
+		*result = syscall_no_intercept(SYS_read, file, buf, length);
+		read_size += *result;
+		num_posix_read++;
+		posix_read_size += *result;
+		END_TIMING(read_t, read_time);
+		GLOBAL_UNLOCK_WR();
+		return RETT_NO_PASS_KERN;
+	}
+
+	res = _nvp_check_read_size_valid(length);
+	if (res <= 0) {
+		END_TIMING(read_t, read_time);
+		GLOBAL_UNLOCK_WR();
+		// A zero-length read is valid and returns 0; only a bad size is EINVAL
+		*result = (res == 0) ? 0 : -EINVAL;
+		return RETT_NO_PASS_KERN;
+	}
+
+	int cpuid = GET_CPUID();
+
+	NVP_LOCK_FD_RD(nvf, cpuid); // TODO
+	NVP_LOCK_NODE_RD(nvf, cpuid);
+
+	TBL_ENTRY_LOCK_RD(tbl_app, cpuid);
+	TBL_ENTRY_LOCK_RD(tbl_over, cpuid);
+
+	res = _nvp_do_pread(file, buf, length,
+			__sync_fetch_and_add(nvf->offset, length),
+			0,
+			cpuid,
+			nvf,
+			tbl_app,
+			tbl_over);
+	if(res < 0) {
+		// errno is set by _nvp_do_pread; fold it into the return value
+		// here so that the final *result = res below reports it
+		res = -errno;
+	}
+
+	NVP_UNLOCK_NODE_RD(nvf, cpuid);
+
+	if(res == length) {
+		DEBUG("PREAD succeeded: extending offset from %li to %li\n",
+				*nvf->offset - res, *nvf->offset);
+	}
+	else if (res <= 0){
+		DEBUG("_nvp_READ: PREAD failed; not changing offset. "
+				"(returned %i)\n", res);
+		//assert(0); // TODO: this is for testing only
+		__sync_fetch_and_sub(nvf->offset, length);
+	} else {
+		DEBUG("_nvp_READ: PREAD failed; Not fully read. "
+				"(returned %i)\n", res);
+		// assert(0); // TODO: this is for testing only
+		__sync_fetch_and_sub(nvf->offset, length - res);
+	}
+
+	NVP_UNLOCK_FD_RD(nvf, cpuid);
+
+	read_size += res;
+
+	END_TIMING(read_t, read_time);
+	DEBUG_FILE("_nvp_READ %d returns %d\n", file, res);
+	GLOBAL_UNLOCK_WR();
+	*result = res;
+	return RETT_NO_PASS_KERN;
+}
diff --git a/splitfs_syscall_intercept/src/relink.c b/splitfs_syscall_intercept/src/relink.c
new file mode 100644
index 0000000000..badc87b844
--- /dev/null
+++ b/splitfs_syscall_intercept/src/relink.c
@@ -0,0 +1,542 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  relink.c
+ *
+ *    Description:
+ *
+ *        Version:  1.0
+ *        Created:  09/25/2019 03:39:05 PM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  YOUR NAME (),
+ *   Organization:
+ *
+ * =====================================================================================
+ */
+#include
+#include "timers.h"
+#include "add_delay.h"
+#include "handle_mmaps.h"
+#include "tbl_mmaps.h"
+#include "nvp_lock.h"
+#include "staging.h"
+#include "log.h"
+#include "relink.h"
+#include "utils.h"
+
+#if DATA_JOURNALING_ENABLED
+
+static inline size_t dynamic_remap_large(int file_fd, struct NVNode *node, int close)
+{
+	size_t len_to_write = 0, len_written = 0, len_to_swap = 0, len_swapped = 0;
+	off_t app_start_addr = 0;
+	off_t app_start_off = 0;
+	off_t file_start_off = node->true_length;
+	off_t over_file_start = 0, over_file_end = 0;
+	off_t over_dr_start = 0, over_dr_end = 0;
+	struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[node->serialno % OVER_TBL_MAX];
+	struct NVTable_regions *regions = _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].regions;
+	int region_id = 0;
+	int valid = 0, i = 0, tbl_idx = 0;
+	int max_region_id = 0;
+	instrumentation_type swap_extents_time, insert_tbl_mmap_time;
+
+	DEBUG_FILE("%s: START: file_fd = %d. 
dr start addr = %p, dr over start addr = %p, true_length = %lu, length = %lu, Inode number = %lu\n", + __func__, file_fd, node->dr_info.start_addr, node->dr_over_info.start_addr, node->true_length, node->length, node->serialno); + + if (node->dr_info.start_addr == 0 && node->dr_over_info.start_addr == 0) + return 0; + + if (node->dr_info.start_addr != 0) { + app_start_addr = node->dr_info.start_addr; + app_start_off = node->dr_info.dr_offset_start; + } + + i = _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region; + while (i < _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].max_dirty_region) { + if (regions[i].highest_off >= node->true_length) { + break; + } + i++; + } + + max_region_id = _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].max_dirty_region; + region_id = _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region; + while (region_id <= max_region_id) { + tbl_idx = 0; + while (tbl_idx < tbl_over->tbl_mmap_index) { + get_tbl_elem_large(&over_file_start, + &over_file_end, + &over_dr_start, + &over_dr_end, + regions[region_id].tbl_mmaps, + tbl_idx); + + if (over_dr_end == 0) { + tbl_idx++; + continue; + } + + if (over_dr_start > over_dr_end) { + MSG("%s: over_file_start = %lld, over_file_end = %lld, over_dr_start = %lld, over_dr_end = %lld\n", __func__, over_file_start, over_file_end, over_dr_start, over_dr_end); + assert(0); + } + if (over_file_start > over_file_end) + assert(0); + if (over_dr_start > node->dr_over_info.dr_offset_end) + assert(0); + + len_to_swap = over_file_end - over_file_start + 1; + START_TIMING(swap_extents_t, swap_extents_time); + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, over_dr fd = %d, file_start = %lld, over_dr start = %lld, over_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_over_info.dr_fd, over_file_start, over_dr_start, (const char *) node->dr_over_info.start_addr, len_to_swap); + len_swapped = syscall(335, file_fd, + node->dr_over_info.dr_fd, + over_file_start, + over_dr_start, + (const char *) node->dr_over_info.start_addr, + len_to_swap); + + tbl_over->tbl_mmaps[tbl_idx].dr_end_off = 0; + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + tbl_idx++; + } + regions[region_id].region_dirty = 0; + if (_nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region == region_id) + _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region++; + + region_id++; + } + + while (region_id < _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].max_dirty_region) { + tbl_idx = 0; + while (regions[region_id].region_dirty == 1 && tbl_idx < regions[region_id].tbl_mmap_index) { + valid = get_lowest_tbl_elem_large(&over_file_start, + &over_file_end, + &over_dr_start, + &over_dr_end, + regions[region_id].tbl_mmaps, + regions[region_id].tbl_mmap_index, + regions[region_id].highest_off); + + if (valid == 0) + break; + + if (over_dr_start > over_dr_end) + assert(0); + if (over_file_start > over_file_end) + assert(0); + if (over_dr_start > node->dr_over_info.dr_offset_end) + assert(0); + + if (file_start_off < over_file_start && app_start_addr != 0) { + len_to_swap = over_file_start - file_start_off + 1; + app_start_off = node->dr_info.dr_offset_start + + file_start_off - node->true_length; + app_start_addr = node->dr_info.start_addr + + app_start_off; + + if (app_start_off > node->dr_info.dr_offset_end) + assert(0); + + // Perform swap extents from append DR file + START_TIMING(swap_extents_t, swap_extents_time); + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, app_dr fd = %d, file_start = 
%lld, app_dr start = %lld, app_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_info.dr_fd, file_start_off, app_start_off, (const char *) node->dr_info.start_addr, len_to_swap); + len_swapped = syscall(335, file_fd, + node->dr_info.dr_fd, + file_start_off, + app_start_off, + (const char *) node->dr_info.start_addr, + len_to_swap); + + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + len_written += len_swapped; + file_start_off += len_swapped; + START_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + insert_tbl_mmap_entry(node, + file_start_off, + app_start_off, + len_swapped, + app_start_addr); + END_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + } + + if (over_dr_start > over_dr_end) + assert(0); + if (over_file_start > over_file_end) + assert(0); + if (over_file_start != file_start_off) + assert(0); + if (over_dr_start > node->dr_over_info.dr_offset_end) + assert(0); + + // Perform swap extents based on over file + START_TIMING(swap_extents_t, swap_extents_time); + len_to_swap = over_file_end - over_file_start + 1; + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, over_dr fd = %d, file_start = %lld, over_dr start = %lld, over_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_over_info.dr_fd, file_start_off, over_dr_start, (const char *) node->dr_over_info.start_addr, len_to_swap); + len_swapped = syscall(335, file_fd, + node->dr_over_info.dr_fd, + over_file_start, + over_dr_start, + (const char *) node->dr_over_info.start_addr, + len_to_swap); + + tbl_over->tbl_mmaps[tbl_idx].dr_end_off = 0; + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + + if (over_file_start > node->true_length) + file_start_off += len_swapped; + len_written += len_swapped; + + tbl_idx++; + } + + regions[region_id].region_dirty = 0; + region_id++; + if (_nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region == region_id) + _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region++; + } + + _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].min_dirty_region = LARGE_TBL_REGIONS; + _nvp_tbl_regions[node->serialno % LARGE_TBL_MAX].max_dirty_region = 0; + + + if (app_start_addr != 0) { + app_start_off = node->dr_info.dr_offset_start + + file_start_off - node->true_length; + + if (node->dr_info.dr_offset_start > node->dr_info.dr_offset_end) + assert(0); + if (app_start_off > node->dr_info.dr_offset_end) + assert(0); + if ((app_start_off % MMAP_PAGE_SIZE) != (file_start_off % MMAP_PAGE_SIZE)) + assert(0); + + len_to_swap = node->dr_info.dr_offset_end - app_start_off; + + if (len_written < len_to_swap) { + app_start_addr = node->dr_info.start_addr + app_start_off; + + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, app_dr fd = %d, file_start = %lld, app_dr start = %lld, app_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_info.dr_fd, file_start_off, app_start_off, (const char *) node->dr_info.start_addr, len_to_swap); + // Perform swap extents from append DR file + START_TIMING(swap_extents_t, swap_extents_time); + len_swapped = syscall(335, file_fd, + node->dr_info.dr_fd, + file_start_off, + app_start_off, + (const char *) node->dr_info.start_addr, + len_to_swap); + + if (len_swapped != len_to_swap) + assert(0); + + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + START_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + insert_tbl_mmap_entry(node, + file_start_off, + app_start_off, + len_swapped, + app_start_addr); + END_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + 
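+			// Assumption: syscall number 335, used throughout this
+			// file, is the swap-extents ("dynamic relink") system
+			// call provided by SplitFS's modified ext4 kernel, not
+			// a stock Linux syscall; it relinks the staged extents
+			// into the target file at file_start_off.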
len_written += len_swapped; + } + } + return len_written; +} + +static inline size_t dynamic_remap_updates(int file_fd, struct NVNode *node, int close, off_t *file_start_off) +{ + size_t len_to_write = 0, len_written = 0, len_to_swap = 0, len_swapped = 0; + off_t app_start_addr = 0; + off_t app_start_off = 0; + off_t over_file_start = 0, over_file_end = 0; + off_t over_dr_start = 0, over_dr_end = 0; + struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[node->serialno % OVER_TBL_MAX]; + int idx_in_over = 0; + instrumentation_type swap_extents_time, insert_tbl_mmap_time; + + if (node->dr_info.start_addr != 0) { + app_start_addr = node->dr_info.start_addr; + app_start_off = node->dr_info.dr_offset_start; + } + + DEBUG_FILE("%s: START: file_fd = %d. dr start addr = %p, dr over start addr = %p, true_length = %lu, length = %lu, Inode number = %lu\n", + __func__, file_fd, node->dr_info.start_addr, node->dr_over_info.start_addr, node->true_length, node->length, node->serialno); + + if (node->dr_over_info.start_addr == 0) + return 0; + + if (node->is_large_file) + return dynamic_remap_large(file_fd, node, close); + + while (idx_in_over < tbl_over->tbl_mmap_index) { + get_lowest_tbl_elem(&over_file_start, + &over_file_end, + &over_dr_start, + &over_dr_end, + tbl_over, + idx_in_over); + + if (over_file_start >= node->true_length) + break; + + if (over_dr_end == 0) { + idx_in_over++; + continue; + } + + if (over_dr_start > over_dr_end) { + MSG("%s: over_file_start = %lld, over_file_end = %lld, over_dr_start = %lld, over_dr_end = %lld\n", __func__, over_file_start, over_file_end, over_dr_start, over_dr_end); + assert(0); + } + if (over_file_start > over_file_end) + assert(0); + if (over_dr_start > node->dr_over_info.dr_offset_end) + assert(0); + + len_to_swap = over_file_end - over_file_start + 1; + START_TIMING(swap_extents_t, swap_extents_time); + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, over_dr fd = %d, file_start = %lld, over_dr start = %lld, over_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_over_info.dr_fd, over_file_start, over_dr_start, (const char *) node->dr_over_info.start_addr, len_to_swap); + len_swapped = syscall(335, file_fd, + node->dr_over_info.dr_fd, + over_file_start, + over_dr_start, + (const char *) node->dr_over_info.start_addr, + len_to_swap); + + tbl_over->tbl_mmaps[idx_in_over].dr_end_off = 0; + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + idx_in_over++; + } + + while (idx_in_over < tbl_over->tbl_mmap_index) { + get_lowest_tbl_elem(&over_file_start, + &over_file_end, + &over_dr_start, + &over_dr_end, + tbl_over, + idx_in_over); + + if (over_dr_end == 0) { + idx_in_over++; + continue; + } + + if (over_dr_start > over_dr_end) + assert(0); + if (over_file_start > over_file_end) + assert(0); + if (over_dr_start > node->dr_over_info.dr_offset_end) + assert(0); + + if (*file_start_off < over_file_start && app_start_addr != 0) { + len_to_swap = over_file_start - *file_start_off + 1; + app_start_off = node->dr_info.dr_offset_start + + *file_start_off - node->true_length; + app_start_addr = node->dr_info.start_addr + + app_start_off; + + if (app_start_off > node->dr_info.dr_offset_end) + assert(0); + + // Perform swap extents from append DR file + START_TIMING(swap_extents_t, swap_extents_time); + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, app_dr fd = %d, file_start = %lld, app_dr start = %lld, app_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_info.dr_fd, file_start_off, app_start_off, (const char 
*) node->dr_info.start_addr, len_to_swap); + len_swapped = syscall(335, file_fd, + node->dr_info.dr_fd, + *file_start_off, + app_start_off, + (const char *) node->dr_info.start_addr, + len_to_swap); + + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + len_written += len_swapped; + *file_start_off += len_swapped; + START_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + insert_tbl_mmap_entry(node, + *file_start_off, + app_start_off, + len_swapped, + app_start_addr); + END_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + } + + if (over_dr_start > over_dr_end) + assert(0); + if (over_file_start > over_file_end) + assert(0); + if (over_file_start != *file_start_off) + assert(0); + if (over_dr_start > node->dr_over_info.dr_offset_end) + assert(0); + + // Perform swap extents based on over file + START_TIMING(swap_extents_t, swap_extents_time); + len_to_swap = over_file_end - over_file_start + 1; + DEBUG_FILE("%s: Dynamic remap args: file_fd = %d, over_dr fd = %d, file_start = %lld, over_dr start = %lld, over_dr start addr = %p, len_to_swap = %lu\n", __func__, file_fd, node->dr_over_info.dr_fd, file_start_off, over_dr_start, (const char *) node->dr_over_info.start_addr, len_to_swap); + len_swapped = syscall(335, file_fd, + node->dr_over_info.dr_fd, + over_file_start, + over_dr_start, + (const char *) node->dr_over_info.start_addr, + len_to_swap); + + tbl_over->tbl_mmaps[idx_in_over].dr_end_off = 0; + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + *file_start_off += len_swapped; + len_written += len_swapped; + + idx_in_over++; + } + return 0; +} + +#endif + +size_t dynamic_remap(int file_fd, struct NVNode *node, int close) { + size_t len_to_write = 0, len_written = 0, len_to_swap = 0, len_swapped = 0; + off_t app_start_addr = 0; + off_t app_start_off = 0; + off_t file_start_off = node->true_length; + off_t over_file_start = 0, over_file_end = 0; + off_t over_dr_start = 0, over_dr_end = 0; + struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[node->serialno % OVER_TBL_MAX]; + int idx_in_over = 0; + instrumentation_type swap_extents_time, insert_tbl_mmap_time; + + DEBUG_FILE("%s: START: file_fd = %d. 
dr start addr = %p, dr over start addr = %p, true_length = %lu, length = %lu, Inode number = %lu\n", + __func__, file_fd, node->dr_info.start_addr, node->dr_over_info.start_addr, node->true_length, node->length, node->serialno); + +#if DATA_JOURNALING_ENABLED + + len_written = dynamic_remap_updates(file_fd, node, close, &file_start_off); + +#endif // DATA_JOURNALING_ENABLED + + len_written = 0; + + if (node->dr_info.start_addr == 0) + return len_written; + + if (node->dr_info.start_addr != 0) { + app_start_addr = node->dr_info.start_addr; + app_start_off = node->dr_info.dr_offset_start; + } + + if (node->dr_info.dr_offset_end - node->dr_info.dr_offset_start == 0) + return len_written; + + if (node->dr_info.dr_offset_end < node->dr_info.dr_offset_start) + return len_written; + + if (app_start_addr != 0) { + app_start_off = node->dr_info.dr_offset_start + + file_start_off - node->true_length; + + if (app_start_off > node->dr_info.dr_offset_end) + assert(0); + if ((app_start_off % MMAP_PAGE_SIZE) != (file_start_off % MMAP_PAGE_SIZE)) + assert(0); + if (app_start_off < node->dr_info.valid_offset) + assert(0); + + len_to_swap = node->dr_info.dr_offset_end - app_start_off; + + if (len_to_swap) { + app_start_addr = node->dr_info.start_addr + app_start_off; + + DEBUG_FILE("%s: file_inode = %lu, dr_inode = %lu, file_fd = %d, dr_fd = %d, " + "valid_offset = %lld, file_offset = %lld, dr_offset = %lld, len = %lu\n", + __func__, node->serialno, node->dr_info.dr_serialno, file_fd, + node->dr_info.dr_fd, node->dr_info.valid_offset, file_start_off, + app_start_off, len_to_swap); + + // Perform swap extents from append DR file + len_swapped = syscall(335, file_fd, + node->dr_info.dr_fd, + file_start_off, + app_start_off, + (const char *) node->dr_info.start_addr, + len_to_swap); + + if (len_swapped != len_to_swap) { + MSG("%s: len_swapped = %lu. Len to swap = %lu\n", __func__, len_swapped, len_to_swap); + if (len_swapped == -1) { + MSG("%s: Swap extents failed. Err = %s\n", __func__, strerror(errno)); + } + assert(0); + } + + END_TIMING(swap_extents_t, swap_extents_time); + num_appendfsync++; + START_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + insert_tbl_mmap_entry(node, + file_start_off, + app_start_off, + len_swapped, + app_start_addr); + END_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + len_written += len_swapped; + } + } + + return len_written; +} + + +size_t swap_extents(struct NVFile *nvf, int close) +{ + size_t len_swapped = 0; + off_t offset_in_page = 0; + + DEBUG_FILE("%s: before dynamic_remap, staging file inode = %lu, nvf->node->dr_info.valid_offset = %lld\n", + __func__, nvf->node->dr_info.dr_serialno, nvf->node->dr_info.valid_offset); + + len_swapped = dynamic_remap(nvf->fd, nvf->node, close); + + if (len_swapped > 0) { + nvf->node->true_length = nvf->node->length; + + if (nvf->node->true_length >= LARGE_FILE_THRESHOLD) + nvf->node->is_large_file = 1; + + nvf->node->dr_info.valid_offset = align_next_page(nvf->node->dr_info.dr_offset_end); + + if (nvf->node->dr_info.valid_offset < DR_SIZE) { + offset_in_page = nvf->node->true_length % PAGE_SIZE; + nvf->node->dr_info.valid_offset += offset_in_page; + } + + DEBUG_FILE("%s: Setting offset_start to DR_SIZE. FD = %d. 
Valid offset = %lu\n", __func__, nvf->fd, nvf->node->dr_info.valid_offset); + DEBUG_FILE("%s: -------------------------------\n", __func__); + + if (nvf->node->dr_info.valid_offset > DR_SIZE) + nvf->node->dr_info.valid_offset = DR_SIZE; + + DEBUG_FILE("%s: after dynamic_remap, staging file inode = %lu, nvf->node->dr_info.valid_offset = %lld\n", + __func__, nvf->node->dr_info.dr_serialno, nvf->node->dr_info.valid_offset); + + nvf->node->dr_info.dr_offset_start = DR_SIZE; + nvf->node->dr_info.dr_offset_end = nvf->node->dr_info.valid_offset; + + if (nvf->node->dr_info.valid_offset > DR_SIZE) + assert(0); + if (nvf->node->dr_info.dr_offset_start > DR_SIZE) + assert(0); + if (nvf->node->dr_info.dr_offset_end > DR_SIZE) + assert(0); + } + + + return len_swapped; +} + + diff --git a/splitfs_syscall_intercept/src/relink.h b/splitfs_syscall_intercept/src/relink.h new file mode 100644 index 0000000000..dcd4e003c4 --- /dev/null +++ b/splitfs_syscall_intercept/src/relink.h @@ -0,0 +1,28 @@ +/* + * ===================================================================================== + * + * Filename: relink.h + * + * Description: + * + * Version: 1.0 + * Created: 09/28/2019 11:33:39 AM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +#ifndef SPLITFS_RELINK_H +#define SPLITFS_RELINK_H + +#include +#include "file.h" + +size_t dynamic_remap(int file_fd, struct NVNode *node, int close); +size_t swap_extents(struct NVFile *nvf, int close); +void perform_dynamic_remap(struct NVFile *nvf); + +#endif diff --git a/splitfs_syscall_intercept/src/rename.c b/splitfs_syscall_intercept/src/rename.c new file mode 100644 index 0000000000..548fa3785d --- /dev/null +++ b/splitfs_syscall_intercept/src/rename.c @@ -0,0 +1,31 @@ +#include +#include + +#include "timers.h" +#include "log.h" + +RETT_SYSCALL_INTERCEPT _sfs_RENAME(INTF_SYSCALL) +{ + DEBUG_FILE("CALL: %s\n", __func__); + + char *old, *new; + old = (char *)arg0; + new = (char *)arg1; + + *result = syscall_no_intercept(SYS_rename, old, new); + instrumentation_type op_log_entry_time; + // Write to op log + +#if !POSIX_ENABLED + if(*result == 0) { + START_TIMING(op_log_entry_t, op_log_entry_time); + persist_op_entry(LOG_RENAME, + old, + new, + 0, + 0); + END_TIMING(op_log_entry_t, op_log_entry_time); + } +#endif + return RETT_NO_PASS_KERN; +} \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/rmdir.c b/splitfs_syscall_intercept/src/rmdir.c new file mode 100644 index 0000000000..ea94fefbdf --- /dev/null +++ b/splitfs_syscall_intercept/src/rmdir.c @@ -0,0 +1,30 @@ +#include +#include + +#include "timers.h" +#include "log.h" + +RETT_SYSCALL_INTERCEPT _sfs_RMDIR(INTF_SYSCALL) +{ + DEBUG_FILE("CALL: %s\n", __func__); + + char *path; + path = (char *)arg0; + + *result = syscall_no_intercept(SYS_rmdir, path); + instrumentation_type op_log_entry_time; + // Write to op log + +#if !POSIX_ENABLED + if(*result == 0) { + START_TIMING(op_log_entry_t, op_log_entry_time); + persist_op_entry(LOG_DIR_DELETE, + path, + NULL, + 0, + 0); + END_TIMING(op_log_entry_t, op_log_entry_time); + } +#endif + return RETT_NO_PASS_KERN; +} \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/seek.c b/splitfs_syscall_intercept/src/seek.c new file mode 100644 index 0000000000..81446c2475 --- /dev/null +++ b/splitfs_syscall_intercept/src/seek.c @@ -0,0 +1,139 @@ +/* + * 
=====================================================================================
+ *
+ *       Filename:  seek.c
+ *
+ *    Description:  lseek() handling for SplitFS-managed files
+ *
+ *        Version:  1.0
+ *        Created:  09/25/2019 03:53:31 PM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  YOUR NAME (),
+ *   Organization:
+ *
+ * =====================================================================================
+ */
+// required for sched_getcpu (GET_CPUID)
+#ifndef _GNU_SOURCE
+	#define _GNU_SOURCE
+#endif
+/* NOTE: the system header names on the next three #include lines were
+ * lost when this patch was extracted; <sched.h>, <errno.h> and
+ * <libsyscall_intercept_hook_point.h> are a best-guess reconstruction. */
+#include <sched.h>
+#include <errno.h>
+
+#include <libsyscall_intercept_hook_point.h>
+#include "nvp_lock.h"
+#include "file.h"
+#include "inode.h"
+#include "timers.h"
+
+off64_t _nvp_do_seek64(int file, off64_t offset, int whence, struct NVFile *nvf)
+{
+	DEBUG("_nvp_do_seek64\n");
+
+	//struct NVFile* nvf = &_nvp_fd_lookup[file];
+
+	DEBUG("_nvp_do_seek64: file len %li, map len %li, current offset %li, "
+		"requested offset %li with whence %i\n",
+		nvf->node->length, nvf->node->maplength, *nvf->offset,
+		offset, whence);
+
+	switch(whence)
+	{
+		case SEEK_SET:
+			if(offset < 0)
+			{
+				DEBUG("offset out of range (would result in "
+					"negative offset).\n");
+				errno = EINVAL;
+				return -1;
+			}
+			*(nvf->offset) = offset;
+			//if(offset == 0)
+			//INITIALIZE_TIMER();
+			return *(nvf->offset);
+
+		case SEEK_CUR:
+			if((*(nvf->offset) + offset) < 0)
+			{
+				DEBUG("offset out of range (would result in "
+					"negative offset).\n");
+				errno = EINVAL;
+				return -1;
+			}
+			*(nvf->offset) += offset;
+			return *(nvf->offset);
+
+		case SEEK_END:
+			if( nvf->node->length + offset < 0 )
+			{
+				DEBUG("offset out of range (would result in "
+					"negative offset).\n");
+				errno = EINVAL;
+				return -1;
+			}
+
+			*(nvf->offset) = nvf->node->length + offset;
+			return *(nvf->offset);
+
+		default:
+			DEBUG("Invalid whence parameter.\n");
+			errno = EINVAL;
+			return -1;
+	}
+
+	assert(0); // unreachable
+	return -1;
+}
+
+RETT_SYSCALL_INTERCEPT _sfs_SEEK(INTF_SYSCALL)
+{
+	DEBUG("%s\n", __func__);
+	int file = (int)arg0;
+
+	if(!_fd_intercept_lookup[file]) {
+		return RETT_PASS_KERN;
+	}
+
+	GLOBAL_LOCK_WR();
+
+	instrumentation_type seek_time;
+	int whence, ret;
+	off_t offset;
+
+	offset = (off_t)arg1;
+	whence = (int)arg2;
+
+	DEBUG("%s: %d\n", __func__, file);
+	START_TIMING(seek_t, seek_time);
+
+	struct NVFile* nvf = &_nvp_fd_lookup[file];
+
+	if (nvf->posix) {
+		DEBUG("Call posix SEEK64 for fd %d\n", nvf->fd);
+		// pass through to the kernel; report the result via *result
+		// and release the global lock before returning, since the
+		// return value of this handler is an intercept status, not
+		// the lseek result
+		*result = syscall_no_intercept(SYS_lseek, file, offset, whence);
+		END_TIMING(seek_t, seek_time);
+		GLOBAL_UNLOCK_WR();
+		DEBUG_FILE("%s: END\n", __func__);
+		return RETT_NO_PASS_KERN;
+	}
+
+	int cpuid = GET_CPUID();
+
+	NVP_LOCK_FD_WR(nvf);
+	NVP_CHECK_NVF_VALID_WR(nvf);
+	NVP_LOCK_NODE_RD(nvf, cpuid);
+
+	ret = _nvp_do_seek64(file, offset, whence, nvf);
+
+	NVP_UNLOCK_NODE_RD(nvf, cpuid);
+	NVP_UNLOCK_FD_WR(nvf);
+
+	END_TIMING(seek_t, seek_time);
+
+	GLOBAL_UNLOCK_WR();
+	if(ret == -1) {
+		// _nvp_do_seek64 sets errno on failure
+		*result = -errno;
+	} else {
+		*result = ret;
+	}
+	return RETT_NO_PASS_KERN;
+}
diff --git a/splitfs_syscall_intercept/src/splitfs_posix.c b/splitfs_syscall_intercept/src/splitfs_posix.c
new file mode 100644
index 0000000000..3292d9a241
--- /dev/null
+++ b/splitfs_syscall_intercept/src/splitfs_posix.c
@@ -0,0 +1,723 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  _nvp_posix.c
+ *
+ *    Description:  syscall_intercept glue and SplitFS initialization
+ *
+ *        Version:  1.0
+ *        Created:  09/25/2019 03:43:16 PM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  YOUR NAME (),
+ *   Organization:
+ *
+ * =====================================================================================
+ */
+/* This translation unit wires SplitFS into libsyscall_intercept:
+ * _initialize_splitfs() builds the fd/node lookup tables and the staging
+ * (DR) file pools, _init_hook_arr() fills syscall_hook_arr[], and hook()
+ * dispatches every intercepted syscall (see the end of this file).
+ * NOTE: the system header names in the #include lines below were lost
+ * in extraction and are left as-is. */
+#include
+#include
+// a module which replaces the standard POSIX functions with memory 
mapped equivalents + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lru_cache.h" +#include "thread_handle.h" + +#include "bg_clear_mmap.h" +#include "add_delay.h" + +#include "staging.h" +#include "inode.h" +#include "timers.h" +#include "non_temporal.h" +#include "nvp_lock.h" +#include "stack.h" +#include "log.h" + +#include "file.h" +#include "tbl_mmaps.h" +#include "mmap_cache.h" +#include "handle_mmaps.h" +#include "fsync.h" +#include "execve.h" + +void _initialize_splitfs(void); +void* _nvp_zbuf; // holds all zeroes. used for aligned file extending. TODO: does sharing this hurt performance? + +// Define a system call intercepting function +typedef RETT_SYSCALL_INTERCEPT (*syscall_hook)(INTF_SYSCALL); + +// Map storing the system call function pointers. Index corresponds to system call number +syscall_hook syscall_hook_arr[512]; + + +void *(*import_memcpy)(void * __restrict__ b, const void * __restrict__ a, size_t n); + +extern void * __memcpy(void * __restrict__ to, const void * __restrict__ from, size_t len); + + +int _nvp_ino_lookup[1024]; +pthread_spinlock_t node_lookup_lock[NUM_NODE_LISTS]; +struct full_dr* _nvp_full_drs; +int full_dr_idx; +pthread_spinlock_t stack_lock; + +struct InodeToMapping* _nvp_ino_mapping; +struct NVFile* _nvp_fd_lookup; +struct NVNode *_nvp_node_lookup[NUM_NODE_LISTS]; + +struct StackNode *_nvp_free_node_list[NUM_NODE_LISTS]; +int _nvp_free_node_list_head[NUM_NODE_LISTS]; +struct NVNode *_nvp_node_lookup[NUM_NODE_LISTS]; + +int run_background_cleaning_thread; +int started_bg_cleaning_thread; +int exit_bg_cleaning_thread; +int calledBgCleaningThread; +int waiting_for_cleaning_signal; + +atomic_uint_fast64_t dr_mem_allocated; + +struct NVTable_maps *_nvp_tbl_mmaps; +struct NVTable_maps *_nvp_over_tbl_mmaps; +struct NVLarge_maps *_nvp_tbl_regions; + +pthread_spinlock_t global_lock; + +int OPEN_MAX; + +static void nvp_cleanup(void) +{ + int i, j; + +#if BG_CLOSING + while(!waiting_for_signal) + sleep(1); + + //cancel thread + cancelBgThread(); + exit_bgthread = 1; + cleanup = 1; + bgCloseFiles(1); +#endif + + nvp_free_dr_mmaps(); + free(_nvp_fd_lookup); + + for (i = 0; i < NUM_NODE_LISTS; i++) { + pthread_spin_lock(&node_lookup_lock[i]); + + for (j = 0; j< OPEN_MAX; j++) { + nvp_cleanup_node(&_nvp_node_lookup[i][j], 1, 1); + } + + pthread_spin_unlock(&node_lookup_lock[i]); + + free(_nvp_node_lookup[i]); + } + + for (i = 0; i < OPEN_MAX; i++) { + nvp_free_btree(_nvp_ino_mapping[i].root, + _nvp_ino_mapping[i].merkle_root, + _nvp_ino_mapping[i].height, + _nvp_ino_mapping[i].root_dirty_cache, + _nvp_ino_mapping[i].root_dirty_num, + _nvp_ino_mapping[i].total_dirty_mmaps); + } + free(_nvp_ino_mapping); + + // DEBUG_FILE("%s: CLEANUP FINISHED\n", __func__); + // MSG("%s: Done Cleaning up\n", __func__); +} + +static void nvp_exit_handler(void) +{ + MSG("Exit: print stats\n"); + nvp_print_io_stats(); + PRINT_TIME(); + + MSG("calling cleanup\n"); + DEBUG_FILE("%s: CLEANUP STARTED\n", __func__); + nvp_cleanup(); +} + +static void _nvp_SIGUSR1_handler(int sig) +{ + MSG("SIGUSR1: print stats\n"); + //nvp_print_time_stats(); + nvp_print_io_stats(); + PRINT_TIME(); +} + +static void _nvp_SIGBUS_handler(int sig) +{ + ERROR("We got a SIGBUS (sig %i)! 
" + "This almost certainly means someone tried to access an area " + "inside an mmaped region but past the length of the mmapped " + "file.\n", sig); + MSG("%s: sigbus got\n", __func__); + fflush(NULL); + + assert(0); +} + +void _mm_cache_flush(void const* p) { + asm volatile("clflush %0" : "+m" (*(volatile char *)(p))); +} + +void _mm_cache_flush_optimised(void const* p) { + asm volatile("clflushopt %0" : "+m" (*(volatile char *)(p))); +} + +// Figure out if CLFLUSHOPT is supported +int is_clflushopt_supported() { + unsigned int eax, ebx, ecx, edx; + __cpuid_count(7, 0, eax, ebx, ecx, edx); + return ebx & bit_CLFLUSHOPT; +} + +void _initialize_splitfs(void) +{ + OPEN_MAX = 1024; + int i, j; + struct InodeToMapping *tempMapping; + + assert(!posix_memalign(((void**)&_nvp_zbuf), 4096, 4096)); + + _nvp_print_fd = fdopen(syscall_no_intercept(SYS_dup, 2), "a"); + MSG("Now printing on fd %p\n", _nvp_print_fd); + assert(_nvp_print_fd >= 0); + + /* + Based on availability of CLFLUSHOPT instruction, point _mm_flush to the + appropriate function + */ + if(is_clflushopt_supported()) { + MSG("CLFLUSHOPT is supported!\n"); + _mm_flush = _mm_cache_flush_optimised; + } else { + MSG("CLFLUSHOPT is not supported! Using CLFLUSH \n"); + _mm_flush = _mm_cache_flush; + } + +#if WORKLOAD_TAR | WORKLOAD_GIT | WORKLOAD_RSYNC + ASYNC_CLOSING = 0; +#else + ASYNC_CLOSING = 1; +#endif // WORKLOAD_TAR + + /* + * Allocating and Initializing NVFiles. Total number of NVFiles = 1024. + * _nvp_fd_lookup is an array of struct NVFile + */ + _nvp_fd_lookup = (struct NVFile*)calloc(OPEN_MAX, + sizeof(struct NVFile)); + // Allocating intercept lookup table. If true then SplitFS handles the file, else passed on to ext4. + _fd_intercept_lookup = (bool *) calloc(OPEN_MAX, sizeof(bool)); + if (!_nvp_fd_lookup || !_fd_intercept_lookup) + assert(0); + // Initializing the valid bits and locks of each NVFile + for(i = 0; i < OPEN_MAX; i++) { + _nvp_fd_lookup[i].valid = 0; + NVP_LOCK_INIT(_nvp_fd_lookup[i].lock); + } + /* Initializing the closed file descriptor array */ + _nvp_closed_files = (struct ClosedFiles*)calloc(TOTAL_CLOSED_INODES, sizeof(struct ClosedFiles)); + for(i = 0; i < TOTAL_CLOSED_INODES; i++) { + _nvp_closed_files[i].fd = -1; + _nvp_closed_files[i].serialno = 0; + _nvp_closed_files[i].index_in_free_list = -1; + _nvp_closed_files[i].next_closed_file = -1; + _nvp_closed_files[i].prev_closed_file = -1; + NVP_LOCK_INIT(_nvp_closed_files[i].lock); + } + if(!_nvp_closed_files) + assert(0); + + /* Initialize and allocate hash table for closed file descriptor array */ + inode_to_closed_file = (struct InodeClosedFile *)calloc(OPEN_MAX, sizeof(struct InodeClosedFile)); + for(i = 0; i < OPEN_MAX; i++) { + inode_to_closed_file[i].index = -1; + NVP_LOCK_INIT(inode_to_closed_file[i].lock); + } + if(!inode_to_closed_file) + assert(0); + + lru_head = -1; + lru_tail = -1; + lru_tail_serialno = 0; + + /* + Allocate and initialize the free list for nodes + */ + for (i = 0; i < NUM_NODE_LISTS; i++) { + _nvp_free_node_list[i] = (struct StackNode*)calloc(OPEN_MAX, + sizeof(struct StackNode)); + for(j = 0; j < OPEN_MAX; j++) { + _nvp_free_node_list[i][j].free_bit = 1; + _nvp_free_node_list[i][j].next_free_idx = j+1; + } + _nvp_free_node_list[i][OPEN_MAX - 1].next_free_idx = -1; + } + + _nvp_free_lru_list = (struct StackNode*)calloc(OPEN_MAX, + sizeof(struct StackNode)); + for(i = 0; i < OPEN_MAX; i++) { + _nvp_free_lru_list[i].free_bit = 1; + _nvp_free_lru_list[i].next_free_idx = i+1; + } + _nvp_free_lru_list[OPEN_MAX - 1].next_free_idx = 
-1; + for (i = 0; i < NUM_NODE_LISTS; i++) { + if (!_nvp_free_node_list[i]) + assert(0); + } + if(!_nvp_free_lru_list) + assert(0); + for (i = 0; i < NUM_NODE_LISTS; i++) { + _nvp_free_node_list_head[i] = 0; + } + _nvp_free_lru_list_head = 0; + /* + Allocating and Initializing mmap cache. Can hold mmaps, merkle trees and dirty mmap caches belonging to 1024 files. _nvp_ino_mapping is an array of struct InodeToMapping + */ + _nvp_ino_mapping = (struct InodeToMapping*)calloc(OPEN_MAX, sizeof(struct InodeToMapping)); + memset((void *)_nvp_ino_mapping, 0, OPEN_MAX * sizeof(struct InodeToMapping)); + if (!_nvp_ino_mapping) + assert(0); + for(i=0; iroot = malloc(1024 * sizeof(unsigned long)); + memset((void *)tempMapping->root, 0, 1024 * sizeof(unsigned long)); + + tempMapping->merkle_root = malloc(1024 * sizeof(unsigned long)); + memset((void *)tempMapping->merkle_root, 0, 1024 * sizeof(unsigned long)); + + // Allocating region to store dirty mmap caches + tempMapping->root_dirty_cache = malloc(20 * sizeof(unsigned long)); + memset((void *)tempMapping->root_dirty_cache, 0, 20 * sizeof(unsigned long)); + + tempMapping->root_dirty_num = 0; + tempMapping->total_dirty_mmaps = 0; + + // Initializing the inode numbers = keys to 0 + _nvp_ino_mapping[i].serialno = 0; + } + /* + * Allocating and Initializing NVNode. Number of NVNodes = 1024. + * _nvp_node_lookup is an array of struct NVNode + */ + for (i = 0; i < NUM_NODE_LISTS; i++) { + _nvp_node_lookup[i] = (struct NVNode*)calloc(OPEN_MAX, + sizeof(struct NVNode)); + if (!_nvp_node_lookup[i]) + assert(0); + + _nvp_backup_roots[i] = (struct backupRoots*)calloc(OPEN_MAX, + sizeof(struct backupRoots)); + if (!_nvp_backup_roots[i]) + assert(0); + + + memset((void *)_nvp_node_lookup[i], 0, OPEN_MAX * sizeof(struct NVNode)); + // Allocating and initializing all the dynamic structs inside struct NVNode + for(j = 0; j < OPEN_MAX; j++) { + // Initializing lock associated with NVNode + NVP_LOCK_INIT(_nvp_node_lookup[i][j].lock); + + // Allocating and Initializing mmap() roots associated with NVNode + _nvp_node_lookup[i][j].root = malloc(1024 * sizeof(unsigned long)); + memset((void *)_nvp_node_lookup[i][j].root, 0, 1024 * sizeof(unsigned long)); + + // Allocating and Initializing merkle tree roots associated with NVNode + _nvp_node_lookup[i][j].merkle_root = malloc(1024 * sizeof(unsigned long)); + memset((void *)_nvp_node_lookup[i][j].merkle_root, 0, 1024 * sizeof(unsigned long)); + + // Allocating and Initializing the dirty mmap cache associated with NVNode + _nvp_node_lookup[i][j].root_dirty_cache = malloc(20 * sizeof(unsigned long)); + memset((void *)_nvp_node_lookup[i][j].root_dirty_cache, 0, 20 * sizeof(unsigned long)); + + _nvp_node_lookup[i][j].root_dirty_num = 0; + _nvp_node_lookup[i][j].total_dirty_mmaps = 0; + + // Allocating and Initializing DR root of the node + memset((void *)&_nvp_node_lookup[i][j].dr_info, 0, sizeof(struct free_dr_pool)); + + _nvp_backup_roots[i][j].root = _nvp_node_lookup[i][j].root; + _nvp_backup_roots[i][j].merkle_root = _nvp_node_lookup[i][j].merkle_root; + _nvp_backup_roots[i][j].root_dirty_cache = _nvp_node_lookup[i][j].root_dirty_cache; + + } + } + + /* + Allocating and Initializing the free pool of DR mmap()s. Total number of mmap()s allowed = 1024. 
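+	   Each pool entry is a pre-faulted staging (DR) file of DR_SIZE
+	   bytes created under NVMM_PATH; appends are absorbed by these
+	   files and later relinked into the real file by swap_extents()
+	   (see relink.c).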
+ */ + //lfds711_queue_umm_init_valid_on_current_logical_core( &qs, &qe_dummy, NULL ); + lfq_init(&staging_mmap_queue_ctx, NVP_NUM_LOCKS/2); + +#if DATA_JOURNALING_ENABLED + + //lfds711_queue_umm_init_valid_on_current_logical_core( &qs_over, &qe_dummy_over, NULL ); + lfq_init(&staging_over_mmap_queue_ctx, NVP_NUM_LOCKS/2); + +#endif + + MMAP_PAGE_SIZE = getpagesize(); + MMAP_HUGEPAGE_SIZE = 2097152; + + init_append_log(); +#if !POSIX_ENABLED + init_op_log(); +#endif + + struct free_dr_pool *free_pool_mmaps; + char prefault_buf[MMAP_PAGE_SIZE]; + char dr_fname[256]; + int dr_fd, ret; + struct stat stat_buf; + int max_perms = PROT_READ | PROT_WRITE; + int num_dr_blocks = DR_SIZE / MMAP_PAGE_SIZE; + free_pool_mmaps = (struct free_dr_pool *) malloc(sizeof(struct free_dr_pool)*INIT_NUM_DR); + for (i = 0; i < MMAP_PAGE_SIZE; i++) + prefault_buf[i] = '0'; + + for (i = 0; i < INIT_NUM_DR; i++) { + sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-XXXXXX"); + dr_fd = syscall_no_intercept(SYS_open, mktemp(dr_fname), O_RDWR | O_CREAT, 0666); + if (dr_fd < 0) { + MSG("%s: mkstemp of DR file failed. Err = %s\n", + __func__, strerror(errno)); + assert(0); + } + ret = posix_fallocate(dr_fd, 0, DR_SIZE); + if (ret < 0) { + MSG("%s: posix_fallocate failed. Err = %s\n", + __func__, strerror(errno)); + assert(0); + } + num_mmap++; + num_drs++; + free_pool_mmaps[i].start_addr = (unsigned long) FSYNC_MMAP + ( + NULL, + DR_SIZE, + max_perms, //max_perms, + MAP_SHARED | MAP_POPULATE, + dr_fd, //fd_with_max_perms, + 0 + ); + fstat(dr_fd, &stat_buf); + free_pool_mmaps[i].dr_serialno = stat_buf.st_ino; + free_pool_mmaps[i].dr_fd = dr_fd; + free_pool_mmaps[i].valid_offset = 0; + free_pool_mmaps[i].dr_offset_start = DR_SIZE; + free_pool_mmaps[i].dr_offset_end = free_pool_mmaps[i].valid_offset; + + for (j = 0; j < num_dr_blocks; j++) { +#if NON_TEMPORAL_WRITES + + if(MEMCPY_NON_TEMPORAL((char *)free_pool_mmaps[i].start_addr + j*MMAP_PAGE_SIZE, prefault_buf, MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy failed\n", __func__); + assert(0); + } + +#else + + if(FSYNC_MEMCPY((char *)free_pool_mmaps[i].start_addr + j*MMAP_PAGE_SIZE, prefault_buf, MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy failed\n", __func__); + assert(0); + } + +#endif // NON_TEMPORAL_WRITES + +#if NVM_DELAY + + perfmodel_add_delay(0, MMAP_PAGE_SIZE); + +#endif //NVM_DELAY + + } + + //LFDS711_QUEUE_UMM_SET_VALUE_IN_ELEMENT(free_pool_mmaps[i].qe, + // &free_pool_mmaps[i] ); + + //lfds711_queue_umm_enqueue( &qs, &free_pool_mmaps[i].qe ); + if (lfq_enqueue(&staging_mmap_queue_ctx, &(free_pool_mmaps[i])) != 0) + assert(0); + + MSG("%s: dr fd = %d, start addr = %p\n", __func__, dr_fd, + free_pool_mmaps[i].start_addr); + dr_fname[0] = '\0'; + num_drs_left++; + } + +#if DATA_JOURNALING_ENABLED + + int num_dr_over_blocks = DR_OVER_SIZE / MMAP_PAGE_SIZE; + free_pool_mmaps = NULL; + free_pool_mmaps = (struct free_dr_pool *) malloc(sizeof(struct free_dr_pool)*INIT_NUM_DR_OVER); + for (i = 0; i < MMAP_PAGE_SIZE; i++) + prefault_buf[i] = '0'; + + for (i = 0; i < INIT_NUM_DR_OVER; i++) { + sprintf(dr_fname, "%s%s", NVMM_PATH, "DR-OVER-XXXXXX"); + dr_fd = syscall_no_intercept(SYS_open, mktemp(dr_fname), O_RDWR | O_CREAT, 0666); + if (dr_fd < 0) { + MSG("%s: mkstemp of DR file failed. Err = %s\n", + __func__, strerror(dr_fd)); + assert(0); + } + ret = posix_fallocate(dr_fd, 0, DR_OVER_SIZE); + if (ret < 0) { + MSG("%s: posix_fallocate failed. 
Err = %s\n", + __func__, strerror(errno)); + assert(0); + } + num_mmap++; + num_drs++; + free_pool_mmaps[i].start_addr = (unsigned long) FSYNC_MMAP + ( + NULL, + DR_OVER_SIZE, + max_perms, //max_perms, + MAP_SHARED | MAP_POPULATE, + dr_fd, //fd_with_max_perms, + 0 + ); + syscall_no_intercept(SYS_fstat, dr_fd, &stat_buf); + free_pool_mmaps[i].dr_serialno = stat_buf.st_ino; + free_pool_mmaps[i].dr_fd = dr_fd; + free_pool_mmaps[i].valid_offset = 0; + free_pool_mmaps[i].dr_offset_start = free_pool_mmaps[i].valid_offset; + free_pool_mmaps[i].dr_offset_end = DR_OVER_SIZE; + + for (j = 0; j < num_dr_over_blocks; j++) { + +#if NON_TEMPORAL_WRITES + + if(MEMCPY_NON_TEMPORAL((char *)free_pool_mmaps[i].start_addr + j*MMAP_PAGE_SIZE, prefault_buf, MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy failed\n", __func__); + assert(0); + } + +#else // NON_TEMPORAL_WRITES + + if(FSYNC_MEMCPY((char *)free_pool_mmaps[i].start_addr + j*MMAP_PAGE_SIZE, prefault_buf, MMAP_PAGE_SIZE) == NULL) { + MSG("%s: non-temporal memcpy failed\n", __func__); + assert(0); + } + +#endif // NON_TEMPORAL_WRITES + +#if NVM_DELAY + + perfmodel_add_delay(0, MMAP_PAGE_SIZE); + +#endif //NVM_DELAY + + } + + //LFDS711_QUEUE_UMM_SET_VALUE_IN_ELEMENT(free_pool_mmaps[i].qe, + // &free_pool_mmaps[i] ); + + if (lfq_enqueue(&staging_over_mmap_queue_ctx, &(free_pool_mmaps[i])) != 0) + assert(0); + //lfds711_queue_umm_enqueue( &qs_over, &free_pool_mmaps[i].qe ); + + MSG("%s: dr fd = %d, start addr = %p\n", __func__, dr_fd, + free_pool_mmaps[i].start_addr); + dr_fname[0] = '\0'; + num_drs_left++; + } + + // Creating array of full DRs to dispose at process end time. + _nvp_full_drs = (struct full_dr *) malloc(1024*sizeof(struct full_dr)); + memset((void *) _nvp_full_drs, 0, 1024*sizeof(struct full_dr)); + full_dr_idx = 0; + + _nvp_tbl_regions = (struct NVLarge_maps *) malloc(LARGE_TBL_MAX*sizeof(struct NVLarge_maps)); + memset((void *) _nvp_tbl_regions, 0, LARGE_TBL_MAX*sizeof(struct NVLarge_maps)); + for (i = 0; i < LARGE_TBL_MAX; i++) { + _nvp_tbl_regions[i].regions = (struct NVTable_regions *) malloc(LARGE_TBL_REGIONS*sizeof(struct NVTable_regions)); + memset((void *) _nvp_tbl_regions[i].regions, 0, LARGE_TBL_REGIONS*sizeof(struct NVTable_regions)); + for (j = 0; j < LARGE_TBL_REGIONS; j++) { + _nvp_tbl_regions[i].regions[j].tbl_mmaps = (struct table_mmaps *) malloc(PER_REGION_TABLES*sizeof(struct table_mmaps)); + _nvp_tbl_regions[i].regions[j].lowest_off = (REGION_COVERAGE)*(j + 1); + _nvp_tbl_regions[i].regions[j].highest_off = 0; + memset((void *) _nvp_tbl_regions[i].regions[j].tbl_mmaps, 0, PER_REGION_TABLES*sizeof(struct table_mmaps)); + } + _nvp_tbl_regions[i].min_dirty_region = LARGE_TBL_REGIONS; + _nvp_tbl_regions[i].max_dirty_region = 0; + } + + MSG("%s: Large regions set\n", __func__); + + _nvp_over_tbl_mmaps = (struct NVTable_maps *) malloc(OVER_TBL_MAX*sizeof(struct NVTable_maps)); + for (i = 0; i < OVER_TBL_MAX; i++) { + _nvp_over_tbl_mmaps[i].tbl_mmaps = (struct table_mmaps *) malloc(NUM_OVER_TBL_MMAP_ENTRIES*sizeof(struct table_mmaps)); + memset((void *)_nvp_over_tbl_mmaps[i].tbl_mmaps, 0, NUM_OVER_TBL_MMAP_ENTRIES*sizeof(struct table_mmaps)); + _nvp_over_tbl_mmaps[i].tbl_mmap_index = 0; + NVP_LOCK_INIT(_nvp_over_tbl_mmaps[i].lock); + } + + MSG("%s: Tbl over mmaps set\n", __func__); + +#endif // DATA_JOURNALING_ENABLED + + _nvp_tbl_mmaps = (struct NVTable_maps *) malloc(APPEND_TBL_MAX*sizeof(struct NVTable_maps)); + for (i = 0; i < APPEND_TBL_MAX; i++) { + _nvp_tbl_mmaps[i].tbl_mmaps = (struct table_mmaps *) 
malloc(NUM_APP_TBL_MMAP_ENTRIES*sizeof(struct table_mmaps)); + memset((void *)_nvp_tbl_mmaps[i].tbl_mmaps, 0, NUM_APP_TBL_MMAP_ENTRIES*sizeof(struct table_mmaps)); + _nvp_tbl_mmaps[i].tbl_mmap_index = 0; + NVP_LOCK_INIT(_nvp_tbl_mmaps[i].lock); + } + + MSG("%s: Tbl mmaps set\n", __func__); + + // Initializing global lock for accessing NVNode + for (i = 0; i < NUM_NODE_LISTS; i++) { + pthread_spin_init(&node_lookup_lock[i], PTHREAD_PROCESS_SHARED); + } + pthread_spin_init(&global_lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&global_lock_closed_files, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&global_lock_lru_head, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&stack_lock, PTHREAD_PROCESS_SHARED); + + MSG("%s: Global locks created\n", __func__); + + SANITYCHECK(MMAP_PAGE_SIZE > 100); + INITIALIZE_TIMERS(); + /* + Setting up variables and initialization for background thread + */ + cleanup = 0; + + waiting_for_signal = 0; + started_bgthread = 0; + exit_bgthread = 0; + waiting_for_cleaning_signal = 0; + started_bg_cleaning_thread = 0; + exit_bg_cleaning_thread = 0; + + lim_num_files = 100; + lim_dr_mem = (5ULL) * 1024 * 1024 * 1024; + lim_dr_mem_closed = 500 * 1024 * 1024; + run_background_thread = 0; + initEnvForBg(); + initEnvForBgClean(); + MSG("%s: initialized environment, OPEN_MAX = %d\n", __func__, OPEN_MAX); + dr_mem_allocated = 0; + dr_mem_closed_files = 0; +#if BG_CLOSING + calledBgThread = 0; + startBgThread(); +#endif +#if BG_CLEANING + calledBgCleaningThread = 0; + startBgCleaningThread(); +#endif + /* + * Setting up signal handlers: SIGBUS and SIGUSR + */ + DEBUG("Installing signal handler.\n"); + signal(SIGBUS, _nvp_SIGBUS_handler); + /* For filebench */ + signal(SIGUSR1, _nvp_SIGUSR1_handler); + /* + Setting up the exit handler to print stats + */ + atexit(nvp_exit_handler); + + int pid = getpid(); + char exec_nvp_filename[BUF_SIZE]; + + sprintf(exec_nvp_filename, "/dev/shm/exec-ledger-%d", pid); + if (access(exec_nvp_filename, F_OK ) != -1) { + _sfs_SHM_COPY(); + } +} + +void _init_hook_arr() { + syscall_hook_arr[SYS_open] = &_sfs_OPEN; + syscall_hook_arr[SYS_close] = &_sfs_CLOSE; + syscall_hook_arr[SYS_read] = &_sfs_READ; + syscall_hook_arr[SYS_write] = &_sfs_WRITE; + syscall_hook_arr[SYS_lseek] = &_sfs_SEEK; + syscall_hook_arr[SYS_execve] = &_sfs_EXECVE; + syscall_hook_arr[SYS_fsync] = &_sfs_FSYNC; + syscall_hook_arr[SYS_dup] = &_sfs_DUP; + syscall_hook_arr[SYS_dup2] = &_sfs_DUP2; +#if !POSIX_ENABLED + syscall_hook_arr[SYS_mknod] = &_sfs_MKNOD; + syscall_hook_arr[SYS_mknodat] = &_sfs_MKNODAT; + syscall_hook_arr[SYS_mkdir] = &_sfs_MKDIR; + syscall_hook_arr[SYS_mkdirat] = &_sfs_MKDIRAT; + syscall_hook_arr[SYS_rename] = &_sfs_RENAME; + syscall_hook_arr[SYS_rmdir] = &_sfs_RMDIR; + syscall_hook_arr[SYS_link] = &_sfs_LINK; + syscall_hook_arr[SYS_symlink] = &_sfs_SYMLINK; + syscall_hook_arr[SYS_symlinkat] = &_sfs_SYMLINKAT; + syscall_hook_arr[SYS_unlink] = &_sfs_UNLINK; + syscall_hook_arr[SYS_unlinkat] = &_sfs_UNLINKAT; +#endif +} + +#if DEBUG_INTERCEPTIONS +bool visited[512]; +int sfd; +#endif + +static RETT_SYSCALL_INTERCEPT +hook(long syscall_number, INTF_SYSCALL) +{ + // If not defined then pass to kernel + if(syscall_hook_arr[syscall_number] == NULL) { + +#if DEBUG_INTERCEPTIONS + // Write to the file all the system calls that were not intercepted by SplitFS + int num = (int)syscall_number; + if(sfd == 0) { + sfd = syscall_no_intercept(SYS_open, "/tmp/sfs_unintercepted_syscalls.log", O_CREAT | O_RDWR | O_APPEND, 0644); + if(sfd <= 0) { + perror("error!"); + } + 
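+			// Note: perror() reports libc's errno, which
+			// syscall_no_intercept() is not expected to set; the
+			// negative return value of SYS_open itself carries the
+			// error code.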
assert(sfd > 0); + } + if(!visited[num]) { + char buf[512]; + int len; + len = sprintf(buf, "%d\n", num); + syscall_no_intercept(SYS_write, sfd, buf, len); + visited[num] = true; + } +#endif + return RETT_PASS_KERN; + } + return syscall_hook_arr[syscall_number](arg0, arg1, arg2, arg3, arg4, arg5, result); +} + +static __attribute__((constructor)) void +init(void) +{ + _initialize_splitfs(); + _init_hook_arr(); + + // Set up the callback function + intercept_hook_point = hook; +} \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/stack.c b/splitfs_syscall_intercept/src/stack.c new file mode 100644 index 0000000000..383e454402 --- /dev/null +++ b/splitfs_syscall_intercept/src/stack.c @@ -0,0 +1,81 @@ +#include "stack.h" + +void push_in_stack(int free_node_list, int free_lru_list, int idx_in_list, int list_idx) { + + if (free_lru_list) + STACK_LOCK_WR(); + + if (free_node_list) { + _nvp_free_node_list[list_idx][idx_in_list].free_bit = 1; + _nvp_free_node_list[list_idx][idx_in_list].next_free_idx = _nvp_free_node_list_head[list_idx]; + _nvp_free_node_list_head[list_idx] = idx_in_list; + } else if (free_lru_list) { + _nvp_free_lru_list[idx_in_list].free_bit = 1; + _nvp_free_lru_list[idx_in_list].next_free_idx = _nvp_free_lru_list_head; + _nvp_free_lru_list_head = idx_in_list; + } else { + if (free_lru_list) + STACK_UNLOCK_WR(); + return; + } + + if (free_lru_list) + STACK_UNLOCK_WR(); +} + + +int pop_from_stack(int free_node_list, int free_lru_list, int list_idx) { + + int idx_in_list = -1; + int candidate = -1; + + if (free_lru_list) + STACK_LOCK_WR(); + + if (free_node_list) { + while (_nvp_free_node_list[list_idx][_nvp_free_node_list_head[list_idx]].free_bit != 1 && _nvp_free_node_list_head[list_idx] != -1) { + + if(candidate == -1 && _nvp_node_lookup[list_idx][_nvp_free_node_list_head[list_idx]].reference == 0) + candidate = _nvp_free_node_list_head[list_idx]; + + _nvp_free_node_list_head[list_idx] = _nvp_free_node_list[list_idx][_nvp_free_node_list_head[list_idx]].next_free_idx; + } + + if (_nvp_free_node_list_head[list_idx] == -1) + goto candidate_lookup; + + if (_nvp_free_node_list[list_idx][_nvp_free_node_list_head[list_idx]].free_bit == 1) { + idx_in_list = _nvp_free_node_list_head[list_idx]; + _nvp_free_node_list[list_idx][idx_in_list].free_bit = 0; + _nvp_free_node_list_head[list_idx] = _nvp_free_node_list[list_idx][idx_in_list].next_free_idx; + goto out; + } + + } else if (free_lru_list) { + while (_nvp_free_lru_list[_nvp_free_lru_list_head].free_bit != 1 && _nvp_free_lru_list_head != -1) + _nvp_free_lru_list_head = _nvp_free_lru_list[_nvp_free_lru_list_head].next_free_idx; + + if (_nvp_free_lru_list_head == -1) + return -1; + + idx_in_list = _nvp_free_lru_list_head; + _nvp_free_lru_list[idx_in_list].free_bit = 0; + _nvp_free_lru_list_head = _nvp_free_lru_list[idx_in_list].next_free_idx; + + goto out; + } + + candidate_lookup: + if (candidate != -1) { + _nvp_free_node_list[list_idx][candidate].free_bit = 0; + _nvp_free_node_list_head[list_idx] = _nvp_free_node_list[list_idx][candidate].next_free_idx; + } + + idx_in_list = candidate; + + out: + if (free_lru_list) + STACK_UNLOCK_WR(); + + return idx_in_list; +} diff --git a/splitfs_syscall_intercept/src/stack.h b/splitfs_syscall_intercept/src/stack.h new file mode 100644 index 0000000000..33b4d3b52f --- /dev/null +++ b/splitfs_syscall_intercept/src/stack.h @@ -0,0 +1,32 @@ +#ifndef LEDGER_SRC_STACK_H_ +#define LEDGER_SRC_STACK_H_ + +#include +#include "inode.h" + +#define STACK_LOCK_WR() {(void)(stack_lock);} 
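+/* NOTE: both STACK_LOCK_WR() and STACK_UNLOCK_WR() are currently no-ops
+ * that merely evaluate stack_lock to silence unused-variable warnings;
+ * the free-list operations in stack.c are presumably serialized by
+ * their callers (e.g. via node_lookup_lock) instead. */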
+#define STACK_UNLOCK_WR() {(void)(stack_lock);} + +/* + * Declare the structure that will hold information of the files that are to be closed + */ +struct StackNode { + int free_bit; + int next_free_idx; +}; + +/* + * Global variables to hold the head and tail of LRU list + */ +struct StackNode *_nvp_free_node_list[NUM_NODE_LISTS]; +struct StackNode *_nvp_free_lru_list; +int _nvp_free_node_list_head[NUM_NODE_LISTS]; +int _nvp_free_lru_list_head; +struct NVNode *_nvp_node_lookup[NUM_NODE_LISTS]; +struct backupRoots *_nvp_backup_roots[NUM_NODE_LISTS]; +pthread_spinlock_t stack_lock; + +void push_in_stack(int free_node_list, int free_lru_list, int idx_in_list, int list_idx); +int pop_from_stack(int free_node_list, int free_lru_list, int list_idx); + +#endif diff --git a/splitfs_syscall_intercept/src/staging.c b/splitfs_syscall_intercept/src/staging.c new file mode 100644 index 0000000000..81dd40848a --- /dev/null +++ b/splitfs_syscall_intercept/src/staging.c @@ -0,0 +1,76 @@ +/* + * ===================================================================================== + * + * Filename: staging.c + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:44:43 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +#include +#include "utils.h" +#include "staging.h" +#include "handle_mmaps.h" + +void nvp_transfer_to_free_dr_pool(struct NVNode *node) +{ + int i, num_free_dr_mmaps; + struct free_dr_pool *free_pool_of_dr_mmap; + unsigned long offset_in_page = 0; + +#if DATA_JOURNALING_ENABLED + + if(node->dr_over_info.start_addr != 0) { + free_pool_of_dr_mmap = (struct free_dr_pool *) malloc(sizeof(struct free_dr_pool)); + free_pool_of_dr_mmap->dr_offset_start = node->dr_over_info.dr_offset_start; + free_pool_of_dr_mmap->dr_offset_end = DR_OVER_SIZE; + free_pool_of_dr_mmap->start_addr = node->dr_over_info.start_addr; + free_pool_of_dr_mmap->dr_fd = node->dr_over_info.dr_fd; + free_pool_of_dr_mmap->dr_serialno = node->dr_over_info.dr_serialno; + free_pool_of_dr_mmap->valid_offset = node->dr_over_info.valid_offset; + + //LFDS711_QUEUE_UMM_SET_VALUE_IN_ELEMENT(free_pool_of_dr_mmap->qe, free_pool_of_dr_mmap); + //lfds711_queue_umm_enqueue( &qs_over, &(free_pool_of_dr_mmap->qe) ); + if (lfq_enqueue(&staging_over_mmap_queue_ctx, free_pool_of_dr_mmap) != 0) + assert(0); + + memset((void *)&node->dr_over_info, 0, sizeof(struct free_dr_pool)); + __atomic_fetch_sub(&dr_mem_allocated, DR_OVER_SIZE, __ATOMIC_SEQ_CST); + } + +#endif // DATA_JOURNALING_ENABLED + + if(node->dr_info.start_addr != 0) { + free_pool_of_dr_mmap = (struct free_dr_pool *) malloc(sizeof(struct free_dr_pool)); + + node->dr_info.valid_offset = align_cur_page(node->dr_info.valid_offset); + if (node->dr_info.valid_offset > DR_SIZE) + node->dr_info.valid_offset = DR_SIZE; + + free_pool_of_dr_mmap->dr_offset_start = DR_SIZE; + free_pool_of_dr_mmap->dr_offset_end = node->dr_info.valid_offset; + free_pool_of_dr_mmap->start_addr = node->dr_info.start_addr; + free_pool_of_dr_mmap->dr_fd = node->dr_info.dr_fd; + free_pool_of_dr_mmap->dr_serialno = node->dr_info.dr_serialno; + free_pool_of_dr_mmap->valid_offset = node->dr_info.valid_offset; + + //LFDS711_QUEUE_UMM_SET_VALUE_IN_ELEMENT(free_pool_of_dr_mmap->qe, free_pool_of_dr_mmap); + //lfds711_queue_umm_enqueue( &qs, &(free_pool_of_dr_mmap->qe) ); + if (lfq_enqueue(&staging_mmap_queue_ctx, free_pool_of_dr_mmap) != 0) + assert(0); + + memset((void 
*)&node->dr_info, 0, sizeof(struct free_dr_pool)); + __atomic_fetch_sub(&dr_mem_allocated, DR_SIZE, __ATOMIC_SEQ_CST); + + DEBUG_FILE("%s: staging inode = %lu. Inserted into global pool with valid offset = %lld\n", + __func__, free_pool_of_dr_mmap->dr_serialno, free_pool_of_dr_mmap->valid_offset); + } +} diff --git a/splitfs_syscall_intercept/src/staging.h b/splitfs_syscall_intercept/src/staging.h new file mode 100644 index 0000000000..5e3ec32960 --- /dev/null +++ b/splitfs_syscall_intercept/src/staging.h @@ -0,0 +1,58 @@ +/* + * ===================================================================================== + * + * Filename: staging.h + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 04:00:51 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +#ifndef SPLITFS_STAGING_H +#define SPLITFS_STAGING_H + +#include +#include "lfq.h" +// #include "liblfds711/inc/liblfds711.h" + +#define DR_APPEND_PATH "/mnt/pmem_emul/DR-XXXXXX" +#define DR_OVER_PATH "/mnt/pmem_emul/DR-OVER-XXXXXX" + +struct free_dr_pool +{ + unsigned long start_addr; + int dr_fd; + ino_t dr_serialno; + unsigned long valid_offset; + unsigned long dr_offset_start; + unsigned long dr_offset_end; +}; + +struct full_dr { + int dr_fd; + unsigned long start_addr; + size_t size; +}; + +#define INIT_NUM_DR 2 +#define INIT_NUM_DR_OVER 2 +#define BG_NUM_DR 1 + +#define DR_SIZE (256*1024*1024) +#define DR_OVER_SIZE (256*1024*1024) + +extern struct full_dr* _nvp_full_drs; +extern int full_dr_idx; + +struct lfq_ctx staging_mmap_queue_ctx; +struct lfq_ctx staging_over_mmap_queue_ctx; + +#endif + diff --git a/splitfs_syscall_intercept/src/tbl_mmaps.c b/splitfs_syscall_intercept/src/tbl_mmaps.c new file mode 100644 index 0000000000..4bfadc1b68 --- /dev/null +++ b/splitfs_syscall_intercept/src/tbl_mmaps.c @@ -0,0 +1,1204 @@ +#include "timers.h" +#include "tbl_mmaps.h" + +#if DATA_JOURNALING_ENABLED + +void get_lowest_tbl_elem(off_t *over_file_start, + off_t *over_file_end, + off_t *over_dr_start, + off_t *over_dr_end, + struct NVTable_maps *tbl, + int idx_in_over) +{ + *over_file_start = tbl->tbl_mmaps[idx_in_over].file_start_off; + *over_file_end = tbl->tbl_mmaps[idx_in_over].file_end_off; + *over_dr_start = tbl->tbl_mmaps[idx_in_over].dr_start_off; + *over_dr_end = tbl->tbl_mmaps[idx_in_over].dr_end_off; +} + +void get_tbl_elem_large(off_t *over_file_start, + off_t *over_file_end, + off_t *over_dr_start, + off_t *over_dr_end, + struct table_mmaps *tbl_mmaps, + int idx_in_over) +{ + *over_file_start = tbl_mmaps[idx_in_over].file_start_off; + *over_file_end = tbl_mmaps[idx_in_over].file_end_off; + *over_dr_start = tbl_mmaps[idx_in_over].dr_start_off; + *over_dr_end = tbl_mmaps[idx_in_over].dr_end_off; +} + + +int get_lowest_tbl_elem_large(off_t *over_file_start, + off_t *over_file_end, + off_t *over_dr_start, + off_t *over_dr_end, + struct table_mmaps *tbl_mmaps, + int tbl_mmap_index, + off_t max_value) +{ + off_t min_value = max_value; + int i = 0, idx_in_over = -1; + + for (i = 0; i < tbl_mmap_index; i++) { + if (tbl_mmaps[i].dr_end_off != 0 && tbl_mmaps[i].file_start_off < min_value) { + idx_in_over = i; + min_value = tbl_mmaps[i].file_start_off; + } + } + + if (idx_in_over != -1) { + *over_file_start = tbl_mmaps[idx_in_over].file_start_off; + *over_file_end = tbl_mmaps[idx_in_over].file_end_off; + *over_dr_start = tbl_mmaps[idx_in_over].dr_start_off; + *over_dr_end = 
tbl_mmaps[idx_in_over].dr_end_off; + return 1; + } + + return 0; +} + +static void shift_last_entry(int region, struct NVTable_regions *regions) +{ + int src_end_index = regions[region].tbl_mmap_index - 1; + + regions[region].tbl_mmaps[src_end_index+1].file_start_off = regions[region].tbl_mmaps[src_end_index].file_start_off; + regions[region].tbl_mmaps[src_end_index+1].dr_start_off = regions[region].tbl_mmaps[src_end_index].dr_start_off; + regions[region].tbl_mmaps[src_end_index+1].file_end_off = regions[region].tbl_mmaps[src_end_index].file_end_off; + regions[region].tbl_mmaps[src_end_index+1].dr_end_off = regions[region].tbl_mmaps[src_end_index].dr_end_off; + regions[region].tbl_mmaps[src_end_index+1].buf_start = regions[region].tbl_mmaps[src_end_index].buf_start; + + regions[region].tbl_mmap_index++; + +} + +static void remove_entry_from_region(int region, int tbl_idx, struct NVTable_regions *regions) +{ + int src_end_index = regions[region].tbl_mmap_index - 1; + int second_last_index = 0; + + if (src_end_index != 0) { + src_end_index--; + second_last_index = 1; + } + + regions[region].tbl_mmaps[tbl_idx].file_start_off = regions[region].tbl_mmaps[src_end_index].file_start_off; + regions[region].tbl_mmaps[tbl_idx].dr_start_off = regions[region].tbl_mmaps[src_end_index].dr_start_off; + regions[region].tbl_mmaps[tbl_idx].file_end_off = regions[region].tbl_mmaps[src_end_index].file_end_off; + regions[region].tbl_mmaps[tbl_idx].dr_end_off = regions[region].tbl_mmaps[src_end_index].dr_end_off; + regions[region].tbl_mmaps[tbl_idx].buf_start = regions[region].tbl_mmaps[src_end_index].buf_start; + + if (second_last_index == 1) { + regions[region].tbl_mmaps[src_end_index].file_start_off = regions[region].tbl_mmaps[src_end_index+1].file_start_off; + regions[region].tbl_mmaps[src_end_index].dr_start_off = regions[region].tbl_mmaps[src_end_index+1].dr_start_off; + regions[region].tbl_mmaps[src_end_index].file_end_off = regions[region].tbl_mmaps[src_end_index+1].file_end_off; + regions[region].tbl_mmaps[src_end_index].dr_end_off = regions[region].tbl_mmaps[src_end_index+1].dr_end_off; + regions[region].tbl_mmaps[src_end_index].buf_start = regions[region].tbl_mmaps[src_end_index+1].buf_start; + src_end_index++; + } + + memset((void *) ®ions[region].tbl_mmaps[src_end_index], 0, sizeof(struct table_mmaps)); + + regions[region].tbl_mmap_index--; +} + +static void exchange_regions(int from_region, int to_region, int tbl_idx, struct NVTable_regions *regions) +{ + int target_mmap_idx = regions[to_region].tbl_mmap_index; + int src_end_index = regions[from_region].tbl_mmap_index - 1; + + regions[to_region].tbl_mmaps[target_mmap_idx].file_start_off = regions[from_region].tbl_mmaps[tbl_idx].file_start_off; + regions[to_region].tbl_mmaps[target_mmap_idx].dr_start_off = regions[from_region].tbl_mmaps[tbl_idx].dr_start_off; + regions[to_region].tbl_mmaps[target_mmap_idx].file_end_off = regions[from_region].tbl_mmaps[tbl_idx].file_end_off; + regions[to_region].tbl_mmaps[target_mmap_idx].dr_end_off = regions[from_region].tbl_mmaps[tbl_idx].dr_end_off; + regions[to_region].tbl_mmaps[target_mmap_idx].buf_start = regions[from_region].tbl_mmaps[tbl_idx].buf_start; + + regions[to_region].tbl_mmap_index++; + + if (regions[to_region].lowest_off > regions[to_region].tbl_mmaps[target_mmap_idx].file_start_off) + regions[to_region].lowest_off = regions[to_region].tbl_mmaps[target_mmap_idx].file_start_off; + if (regions[to_region].highest_off < regions[to_region].tbl_mmaps[target_mmap_idx].file_end_off) + 
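+		// keep the destination region's cached highest_off in sync
+		// with the entry that was just moved in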
regions[to_region].highest_off = regions[to_region].tbl_mmaps[target_mmap_idx].file_end_off; + + remove_entry_from_region(from_region, tbl_idx, regions); +} + +static void set_lowest_and_highest(struct NVTable_regions *regions, int region_id) { + int num_entries = regions[region_id].tbl_mmap_index; + off_t lowest = (REGION_COVERAGE)*(region_id + 1), highest = 0; + int i = 0; + + for (i = 0; i < num_entries; i++) { + if (lowest > regions[region_id].tbl_mmaps[i].file_start_off) + lowest = regions[region_id].tbl_mmaps[i].file_start_off; + if (highest < regions[region_id].tbl_mmaps[i].file_end_off) + highest = regions[region_id].tbl_mmaps[i].file_end_off; + } + + regions[region_id].lowest_off = lowest; + regions[region_id].highest_off = highest; +} + +static int clear_overlapping_entry_large(off_t file_off_start, size_t length, struct NVTable_regions *regions) +{ + int region_id = file_off_start / REGION_COVERAGE; + int cur_region = region_id; + int i = 0, j = 0; + off_t file_off_end = file_off_start + length - 1; + int tbl_idx = 0; + int highest_region = 0, lowest_region = 0, adjusted_lower_region = 0, adjusted_higher_region = 0; + int num_added_regions = 0; + size_t shift_len = 0; + + if ((region_id - 1 >= 0) && (regions[region_id-1].highest_off >= file_off_start) && regions[region_id-1].tbl_mmap_index > 0) { + adjusted_lower_region = 1; + lowest_region = region_id - 1; + } else + lowest_region = region_id; + + if (regions[region_id+1].lowest_off <= file_off_end && regions[region_id+1].lowest_off > 0 && regions[region_id+1].tbl_mmap_index > 0) { + adjusted_higher_region = 1; + highest_region = region_id + 1; + } else + highest_region = region_id; + + cur_region = lowest_region; + while (cur_region <= highest_region) { + tbl_idx = regions[cur_region].tbl_mmap_index; + i = 0; + DEBUG_FILE("%s: STARTING TO CHECK OVERLAP. Cur_region = %d, num_entries in region = %d, lowest region = %d, highest region = %d\n", __func__, cur_region, tbl_idx, lowest_region, highest_region); + while (i < tbl_idx) { + if (regions[cur_region].tbl_mmaps[i].file_end_off > file_off_start) { + if (regions[cur_region].tbl_mmaps[i].file_start_off > file_off_start + length) + goto while_end; + if (regions[cur_region].tbl_mmaps[i].file_start_off == file_off_start) { + if (regions[cur_region].tbl_mmaps[i].file_end_off > file_off_end) { + /* + * |----------------|------------| + * tstart = fstart fend tend + */ + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + if (cur_region != region_id) { + assert(0); + } + + off_t prev_start_off = regions[cur_region].tbl_mmaps[i].file_start_off; + off_t prev_end_off = regions[cur_region].tbl_mmaps[i].file_end_off; + + shift_len = file_off_end + 1 - regions[cur_region].tbl_mmaps[i].file_start_off; + regions[cur_region].tbl_mmaps[i].file_start_off = file_off_end + 1; + regions[cur_region].tbl_mmaps[i].buf_start += shift_len; + regions[cur_region].tbl_mmaps[i].dr_start_off += shift_len; + + if ((regions[cur_region].tbl_mmaps[i].file_start_off / REGION_COVERAGE) != cur_region) { + if (regions[cur_region].highest_off == prev_end_off) + regions[cur_region].highest_off = file_off_end; + exchange_regions(cur_region, cur_region+1, i, regions); + } + + if (regions[cur_region].tbl_mmaps[i].dr_end_off < 0) { + MSG("%s: i = %d, tbl idx = %d. 
tbl fstart = %lld, tbl fend = %lld, tbl dr start = %lld, tbl dr end = %lld. Shift len = %lu. file fstart = %lld, file fend = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, regions[cur_region].tbl_mmaps[i].dr_start_off, regions[cur_region].tbl_mmaps[i].dr_end_off, shift_len, file_off_start, file_off_end); + assert(0); + } + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + goto end; + //break; + } else if (regions[cur_region].tbl_mmaps[i].file_end_off < file_off_end) { + /* + * |-----------------|----------| + * tstart = fstart tend fend + */ + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + if (cur_region != region_id) { + assert(0); + } + + // Remove element from the region + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = file_off_end; + + remove_entry_from_region(cur_region, i, regions); + tbl_idx--; + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + i--; + num_added_regions--; + + goto while_end; + } else { + /* + * |----------------------| + * tstart = fstart tend = fend + */ + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + if (cur_region != region_id) { + assert(0); + } + + // Remove element from the region + remove_entry_from_region(cur_region, i, regions); + tbl_idx--; + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + i--; + num_added_regions--; + + goto end; + //break; + } + } else if (regions[cur_region].tbl_mmaps[i].file_start_off < file_off_start) { + if (regions[cur_region].tbl_mmaps[i].file_end_off > file_off_end) { + /* + * |-------|------------|----------| + * tstart fstart fend tend + */ + + DEBUG_FILE("%s: OVERLAPPING. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + int end_index = 0; + shift_last_entry(cur_region, regions); + if (i != regions[cur_region].tbl_mmap_index - 2) { + end_index = regions[cur_region].tbl_mmap_index - 2; + } else { + end_index = regions[cur_region].tbl_mmap_index - 1; + } + + shift_len = file_off_end + 1 - regions[cur_region].tbl_mmaps[i].file_start_off; + regions[cur_region].tbl_mmaps[end_index].file_start_off = file_off_end + 1; + regions[cur_region].tbl_mmaps[end_index].dr_start_off = regions[cur_region].tbl_mmaps[i].dr_start_off + shift_len; + regions[cur_region].tbl_mmaps[end_index].file_end_off = regions[cur_region].tbl_mmaps[i].file_end_off; + regions[cur_region].tbl_mmaps[end_index].dr_end_off = regions[cur_region].tbl_mmaps[i].dr_end_off; + regions[cur_region].tbl_mmaps[end_index].buf_start = regions[cur_region].tbl_mmaps[i].buf_start + shift_len; + + shift_len = regions[cur_region].tbl_mmaps[i].file_end_off - file_off_start + 1; + regions[cur_region].tbl_mmaps[i].file_end_off = file_off_start - 1; + if (regions[cur_region].tbl_mmaps[i].dr_end_off != 0) + regions[cur_region].tbl_mmaps[i].dr_end_off -= shift_len; + + if (regions[cur_region].tbl_mmaps[i].dr_end_off < 0) + assert(0); + + if ((regions[cur_region].tbl_mmaps[end_index].file_start_off / REGION_COVERAGE) != cur_region) { + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[end_index].file_end_off) + regions[cur_region].highest_off = file_off_start - 1; + exchange_regions(cur_region, cur_region+1, end_index, regions); + } + + num_added_regions++; + tbl_idx++; + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + goto end; + //break; + } else if (regions[cur_region].tbl_mmaps[i].file_end_off < file_off_end) { + /* + * |-------|---------------|------------| + * tstart fstart tend fend + */ + + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = file_off_start - 1; + + shift_len = regions[cur_region].tbl_mmaps[i].file_end_off - file_off_start + 1; + regions[cur_region].tbl_mmaps[i].file_end_off = file_off_start - 1; + if (regions[cur_region].tbl_mmaps[i].dr_end_off != 0) + regions[cur_region].tbl_mmaps[i].dr_end_off -= shift_len; + + if (regions[cur_region].tbl_mmaps[i].dr_end_off < 0) + assert(0); + + DEBUG_FILE("%s: OVERLAPPING HANDLED.\n", __func__); + + goto while_end; + } else { + /* + * |------|----------------| + * tstart fstart tend = fend + */ + DEBUG_FILE("%s: OVERLAPPING. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = file_off_start - 1; + + shift_len = regions[cur_region].tbl_mmaps[i].file_end_off - file_off_start + 1; + regions[cur_region].tbl_mmaps[i].file_end_off = file_off_start - 1; + if (regions[cur_region].tbl_mmaps[i].dr_end_off != 0) + regions[cur_region].tbl_mmaps[i].dr_end_off -= shift_len; + + if (regions[cur_region].tbl_mmaps[i].dr_end_off < 0) + assert(0); + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + goto end; + //break; + } + } else if (regions[cur_region].tbl_mmaps[i].file_start_off > file_off_start) { + if (regions[cur_region].tbl_mmaps[i].file_end_off > file_off_end) { + /* + * |-------|------------|-------------| + * fstart tstart fend tend + */ + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + off_t prev_start_off = regions[cur_region].tbl_mmaps[i].file_start_off; + off_t prev_end_off = regions[cur_region].tbl_mmaps[i].file_end_off; + + shift_len = file_off_end + 1 - regions[cur_region].tbl_mmaps[i].file_start_off; + regions[cur_region].tbl_mmaps[i].file_start_off = file_off_end + 1; + regions[cur_region].tbl_mmaps[i].dr_start_off += shift_len; + regions[cur_region].tbl_mmaps[i].buf_start += shift_len; + + if (regions[cur_region].tbl_mmaps[i].dr_end_off < 0) + assert(0); + + if ((regions[cur_region].tbl_mmaps[i].file_start_off / REGION_COVERAGE) != cur_region) { + if (regions[cur_region].lowest_off == prev_start_off) + regions[cur_region].lowest_off = file_off_start; + if (regions[cur_region].highest_off == prev_end_off) + regions[cur_region].highest_off = file_off_end; + exchange_regions(cur_region, cur_region+1, i, regions); + } else { + if (regions[cur_region].lowest_off == prev_start_off) + regions[cur_region].lowest_off = file_off_end + 1; + } + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + goto while_end; + //break; + } else if (regions[cur_region].tbl_mmaps[i].file_end_off < file_off_end) { + /* + * |-------|---------------|-----------| + * fstart tstart tend fend + */ + // Remove element from the region + + DEBUG_FILE("%s: OVERLAPPING. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + if (cur_region != region_id) { + if (regions[cur_region].lowest_off == regions[cur_region].tbl_mmaps[i].file_start_off) + regions[cur_region].lowest_off = regions[cur_region].tbl_mmaps[i].file_end_off + 1; + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = regions[cur_region].tbl_mmaps[i].file_start_off - 1; + } else { + if (regions[cur_region].lowest_off == regions[cur_region].tbl_mmaps[i].file_start_off) + regions[cur_region].lowest_off = file_off_start; + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = file_off_end; + } + + remove_entry_from_region(cur_region, i, regions); + tbl_idx--; + + DEBUG_FILE("%s: OVERLAPPING HANDLED. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + i--; + num_added_regions--; + + goto while_end; + } else { + /* + * |------|----------------| + * fstart tstart tend = fend + */ + // Remove element from the region + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + if (cur_region != region_id) { + if (regions[cur_region].lowest_off == regions[cur_region].tbl_mmaps[i].file_start_off) + regions[cur_region].lowest_off = regions[cur_region].tbl_mmaps[i].file_end_off + 1; + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = regions[cur_region].tbl_mmaps[i].file_start_off - 1; + } else { + if (regions[cur_region].lowest_off == regions[cur_region].tbl_mmaps[i].file_start_off) + regions[cur_region].lowest_off = file_off_start; + if (regions[cur_region].highest_off == regions[cur_region].tbl_mmaps[i].file_end_off) + regions[cur_region].highest_off = file_off_end; + } + + remove_entry_from_region(cur_region, i, regions); + tbl_idx--; + + DEBUG_FILE("%s: OVERLAPPING HANDLED. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, i, tbl_idx, regions[cur_region].tbl_mmaps[i].file_start_off, regions[cur_region].tbl_mmaps[i].file_end_off, file_off_start, file_off_end); + + i--; + num_added_regions--; + + goto while_end; + //break; + } + } else { + MSG("%s: Weird values in table and file offsets\n", __func__); + assert(0); + } + } + while_end: + i++; + } + cur_region++; + } + + end: + if (adjusted_lower_region) + set_lowest_and_highest(regions, region_id - 1); + if (adjusted_higher_region) + set_lowest_and_highest(regions, region_id + 1); + set_lowest_and_highest(regions, region_id); + + return num_added_regions; +} + + + +static void clear_overlapping_entry(off_t file_off_start, + size_t length, + struct NVTable_maps *tbl) +{ + int tbl_idx = tbl->tbl_mmap_index; + int i = 0, j = 0, idx_bin = 0; + int left = 0, right = tbl->tbl_mmap_index - 1, mid = 0; + off_t file_off_end = file_off_start + length - 1; + off_t shift_len = 0; + int handle_overlaps = 0; + + if (right >= NUM_OVER_TBL_MMAP_ENTRIES - 1) + assert(0); + + if (right < left) { + return; + } + + bin_search: + while (left <= right) { + mid = (right + left) / 2; + + if (tbl->tbl_mmaps[mid].file_end_off > file_off_start) { + if (tbl->tbl_mmaps[mid].file_start_off >= file_off_start + length) { + right = mid - 1; + continue; + } + if (tbl->tbl_mmaps[mid].file_start_off < file_off_start + length) { + if (tbl->tbl_mmaps[mid].file_start_off == file_off_start) { + if (tbl->tbl_mmaps[mid].file_end_off > file_off_end) { + handle_overlaps = 1; + break; + } + else if (tbl->tbl_mmaps[mid].file_end_off < file_off_end) { + handle_overlaps = 2; + break; + } + else { + handle_overlaps = 3; + break; + } + } else if (tbl->tbl_mmaps[mid].file_start_off < file_off_start) { + if (tbl->tbl_mmaps[mid].file_end_off > file_off_end) { + handle_overlaps = 4; + break; + } + else if (tbl->tbl_mmaps[mid].file_end_off < file_off_end) { + handle_overlaps = 5; + break; + } + else { + handle_overlaps = 6; + break; + } + } else if (tbl->tbl_mmaps[mid].file_start_off > file_off_start) { + if (tbl->tbl_mmaps[mid].file_end_off > file_off_end) { + handle_overlaps = 7; + break; + } + else if (tbl->tbl_mmaps[mid].file_end_off < file_off_end) { + handle_overlaps = 8; + break; + } + else { + handle_overlaps = 9; + break; + } + } else { + assert(0); + } + } + } else { + left = mid + 1; + continue; + } + } + + if (left > right) + return; + + switch (handle_overlaps) { + + case 1: + /* + * |----------------|------------| + * tstart = fstart fend tend + */ + shift_len = file_off_end + 1 - tbl->tbl_mmaps[mid].file_start_off; + tbl->tbl_mmaps[mid].file_start_off = file_off_end + 1; + tbl->tbl_mmaps[mid].buf_start += shift_len; + tbl->tbl_mmaps[mid].dr_start_off += shift_len; + + if (tbl->tbl_mmaps[mid].dr_end_off < 0) { + MSG("%s: i = %d, tbl idx = %d. tbl fstart = %lld, tbl fend = %lld, tbl dr start = %lld, tbl dr end = %lld. Shift len = %lu. file fstart = %lld, file fend = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, tbl->tbl_mmaps[mid].dr_start_off, tbl->tbl_mmaps[mid].dr_end_off, shift_len, file_off_start, file_off_end); + assert(0); + } + + DEBUG_FILE("%s: OVERLAPPING. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + return; + + case 2: + /* + * |-----------------|----------| + * tstart = fstart tend fend + */ + for (j = mid; j < tbl->tbl_mmap_index-1; j++) { + tbl->tbl_mmaps[j].file_start_off = tbl->tbl_mmaps[j+1].file_start_off; + tbl->tbl_mmaps[j].dr_start_off = tbl->tbl_mmaps[j+1].dr_start_off; + tbl->tbl_mmaps[j].file_end_off = tbl->tbl_mmaps[j+1].file_end_off; + tbl->tbl_mmaps[j].dr_end_off = tbl->tbl_mmaps[j+1].dr_end_off; + tbl->tbl_mmaps[j].buf_start = tbl->tbl_mmaps[j+1].buf_start; + } + memset(&tbl->tbl_mmaps[tbl->tbl_mmap_index-1], 0, sizeof(struct table_mmaps)); + tbl->tbl_mmap_index--; + + left = 0; + right = tbl->tbl_mmap_index - 1; + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + break; + + case 3: + /* + * |----------------------| + * tstart = fstart tend = fend + */ + for (j = mid; j < tbl->tbl_mmap_index-1; j++) { + tbl->tbl_mmaps[j].file_start_off = tbl->tbl_mmaps[j+1].file_start_off; + tbl->tbl_mmaps[j].dr_start_off = tbl->tbl_mmaps[j+1].dr_start_off; + tbl->tbl_mmaps[j].file_end_off = tbl->tbl_mmaps[j+1].file_end_off; + tbl->tbl_mmaps[j].dr_end_off = tbl->tbl_mmaps[j+1].dr_end_off; + tbl->tbl_mmaps[j].buf_start = tbl->tbl_mmaps[j+1].buf_start; + } + memset(&tbl->tbl_mmaps[tbl->tbl_mmap_index-1], 0, sizeof(struct table_mmaps)); + tbl->tbl_mmap_index--; + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + return; + + case 4: + /* + * |-------|------------|----------| + * tstart fstart fend tend + */ + for (j = tbl_idx-1; j >= mid; j--) { + tbl->tbl_mmaps[j+1].file_start_off = tbl->tbl_mmaps[j].file_start_off; + tbl->tbl_mmaps[j+1].dr_start_off = tbl->tbl_mmaps[j].dr_start_off; + tbl->tbl_mmaps[j+1].file_end_off = tbl->tbl_mmaps[j].file_end_off; + tbl->tbl_mmaps[j+1].dr_end_off = tbl->tbl_mmaps[j].dr_end_off; + tbl->tbl_mmaps[j+1].buf_start = tbl->tbl_mmaps[j].buf_start; + } + shift_len = tbl->tbl_mmaps[mid].file_end_off - file_off_start + 1; + tbl->tbl_mmaps[mid].file_end_off = file_off_start - 1; + if (tbl->tbl_mmaps[mid].dr_end_off != 0) + tbl->tbl_mmaps[mid].dr_end_off -= shift_len; + + if (tbl->tbl_mmaps[mid].dr_end_off < 0) + assert(0); + + shift_len = file_off_end + 1 - tbl->tbl_mmaps[mid+1].file_start_off; + tbl->tbl_mmaps[mid+1].file_start_off = file_off_end + 1; + tbl->tbl_mmaps[mid+1].buf_start += shift_len; + tbl->tbl_mmaps[mid+1].dr_start_off += shift_len; + + if (tbl->tbl_mmaps[mid+1].dr_end_off < 0) + assert(0); + + tbl->tbl_mmap_index++; + + DEBUG_FILE("%s: OVERLAPPING. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + return; + + case 5: + /* + * |-------|---------------|------------| + * tstart fstart tend fend + */ + shift_len = tbl->tbl_mmaps[mid].file_end_off - file_off_start + 1; + tbl->tbl_mmaps[mid].file_end_off = file_off_start - 1; + if (tbl->tbl_mmaps[mid].dr_end_off != 0) + tbl->tbl_mmaps[mid].dr_end_off -= shift_len; + + if (tbl->tbl_mmaps[mid].dr_end_off < 0) + assert(0); + + left = 0; + right = tbl->tbl_mmap_index - 1; + + DEBUG_FILE("%s: OVERLAPPING\n", __func__); + break; + + case 6: + /* + * |------|----------------| + * tstart fstart tend = fend + */ + shift_len = tbl->tbl_mmaps[mid].file_end_off - file_off_start + 1; + tbl->tbl_mmaps[mid].file_end_off = file_off_start - 1; + if (tbl->tbl_mmaps[mid].dr_end_off != 0) + tbl->tbl_mmaps[mid].dr_end_off -= shift_len; + + if (tbl->tbl_mmaps[mid].dr_end_off < 0) + assert(0); + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + return; + + case 7: + /* + * |-------|------------|-------------| + * fstart tstart fend tend + */ + shift_len = file_off_end + 1 - tbl->tbl_mmaps[mid].file_start_off; + tbl->tbl_mmaps[mid].file_start_off = file_off_end + 1; + tbl->tbl_mmaps[mid].dr_start_off += shift_len; + tbl->tbl_mmaps[mid].buf_start += shift_len; + + if (tbl->tbl_mmaps[mid].dr_end_off < 0) + assert(0); + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + return; + + case 8: + /* + * |-------|---------------|-----------| + * fstart tstart tend fend + */ + /* entry mid is fully covered by the new write: compact the array downward to delete it */ + for (j = mid; j < tbl->tbl_mmap_index-1; j++) { + tbl->tbl_mmaps[j].file_start_off = tbl->tbl_mmaps[j+1].file_start_off; + tbl->tbl_mmaps[j].dr_start_off = tbl->tbl_mmaps[j+1].dr_start_off; + tbl->tbl_mmaps[j].file_end_off = tbl->tbl_mmaps[j+1].file_end_off; + tbl->tbl_mmaps[j].dr_end_off = tbl->tbl_mmaps[j+1].dr_end_off; + tbl->tbl_mmaps[j].buf_start = tbl->tbl_mmaps[j+1].buf_start; + } + memset(&tbl->tbl_mmaps[tbl->tbl_mmap_index-1], 0, sizeof(struct table_mmaps)); + tbl->tbl_mmap_index--; + + left = 0; + right = tbl->tbl_mmap_index - 1; + + DEBUG_FILE("%s: OVERLAPPING. i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + break; + + case 9: + /* + * |------|----------------| + * fstart tstart tend = fend + */ + /* entry mid is fully covered by the new write: compact the array downward to delete it */ + for (j = mid; j < tbl->tbl_mmap_index-1; j++) { + tbl->tbl_mmaps[j].file_start_off = tbl->tbl_mmaps[j+1].file_start_off; + tbl->tbl_mmaps[j].dr_start_off = tbl->tbl_mmaps[j+1].dr_start_off; + tbl->tbl_mmaps[j].file_end_off = tbl->tbl_mmaps[j+1].file_end_off; + tbl->tbl_mmaps[j].dr_end_off = tbl->tbl_mmaps[j+1].dr_end_off; + tbl->tbl_mmaps[j].buf_start = tbl->tbl_mmaps[j+1].buf_start; + } + memset(&tbl->tbl_mmaps[tbl->tbl_mmap_index-1], 0, sizeof(struct table_mmaps)); + tbl->tbl_mmap_index--; + + DEBUG_FILE("%s: OVERLAPPING. 
i = %d, Tbl Idx = %d, Tbl_start = %lld, Tbl_end = %lld, File_start = %lld, File_end = %lld\n", __func__, mid, tbl->tbl_mmap_index, tbl->tbl_mmaps[mid].file_start_off, tbl->tbl_mmaps[mid].file_end_off, file_off_start, file_off_end); + return; + }; + + goto bin_search; +} + +static int find_idx_to_insert_large(off_t file_off_start, + struct NVTable_regions *regions) +{ + int region_id = file_off_start / REGION_COVERAGE; + return regions[region_id].tbl_mmap_index; +} + +static int find_idx_to_read_large(off_t file_off_start, + struct NVTable_regions *regions, + int *region_num) +{ + int region_id = file_off_start / REGION_COVERAGE; + int idx = 0, idx_seq = 0; + int cur_region = 0; + + idx = region_id; + while (idx >= 0 && idx >= region_id - 1) { + if ((regions[idx].lowest_off <= file_off_start) && (regions[idx].highest_off >= file_off_start)) { + cur_region = idx; + break; + } + idx--; + } + + if (idx == -1 || idx == region_id - 2) + return -1; + + idx = 0; + while (idx < regions[cur_region].tbl_mmap_index) { + if (regions[cur_region].tbl_mmaps[idx].file_end_off >= file_off_start) { + if (regions[cur_region].tbl_mmaps[idx].file_start_off <= file_off_start) { + idx_seq = idx; + break; + } + } + idx++; + } + if (idx == regions[cur_region].tbl_mmap_index) + idx_seq = -1; + + *region_num = cur_region; + return idx_seq; +} + + + +static int find_idx_to_insert(off_t file_off_start, + struct NVTable_maps *tbl) +{ + int i = 0, idx_bin = 0, idx_seq = 0; + int left = 0, right = tbl->tbl_mmap_index - 1; + int mid; + + if (right >= NUM_OVER_TBL_MMAP_ENTRIES - 1) + assert(0); + + if (right < left) { + idx_bin = 0; + return idx_bin; + } + + while (left <= right) { + mid = (right + left) / 2; + + if (tbl->tbl_mmaps[mid].file_end_off >= file_off_start) { + if (mid == 0) { + idx_bin = mid; + return idx_bin; + } + if (tbl->tbl_mmaps[mid-1].file_end_off < file_off_start) { + idx_bin = mid; + return idx_bin; + } + if (tbl->tbl_mmaps[mid-1].file_end_off >= file_off_start) { + right = mid - 1; + continue; + } + } else { + left = mid + 1; + continue; + } + } + + idx_bin = tbl->tbl_mmap_index; + + return idx_bin; +} + +#endif // DATA_JOURNALING_ENABLED + +static int find_idx_to_read(off_t file_off_start, + struct NVTable_maps *tbl) +{ + int i = 0, idx_bin = 0, idx_seq = 0; + int left = 0, right = tbl->tbl_mmap_index - 1; + int mid = (right + left) / 2; + + if (right < left) { + return -1; + } + + if (mid < 0) + assert(0); + if (left < 0) + assert(0); + if (right < 0) + assert(0); + + if (right >= NUM_OVER_TBL_MMAP_ENTRIES - 1) + assert(0); + + while (left <= right) { + mid = (right + left) / 2; + + if (tbl->tbl_mmaps[mid].file_end_off < file_off_start) { + left = mid + 1; + continue; + } + + if (tbl->tbl_mmaps[mid].file_end_off >= file_off_start && + tbl->tbl_mmaps[mid].file_start_off <= file_off_start) { + idx_bin = mid; + goto out; + } + + if (tbl->tbl_mmaps[mid].file_end_off >= file_off_start && + tbl->tbl_mmaps[mid].file_start_off > file_off_start) { + right = mid - 1; + continue; + } + } + + idx_bin = -1; + + out: + return idx_bin; +} + +void insert_tbl_mmap_entry(struct NVNode *node, + off_t file_off_start, + off_t dr_off_start, + size_t length, + unsigned long buf_start) +{ + off_t prev_off_start = 0, prev_off_end = 0, prev_size, file_off_end = 0, dr_off_end = 0; + unsigned long prev_buf_start = 0, prev_buf_end = 0; + int index = node->serialno % APPEND_TBL_MAX; + int newest_tbl_idx = _nvp_tbl_mmaps[index].tbl_mmap_index; + + DEBUG_FILE("%s: Requesting Insert of file start = %lu, length = %lu, buf_start = %p. 
Inode = %lu\n", + __func__, file_off_start, length, buf_start, node->serialno); + + if (newest_tbl_idx == 0) + goto add_entry; + + prev_off_start = _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx-1].file_start_off; + prev_off_end = _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx-1].file_end_off; + prev_buf_start = _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx-1].buf_start; + prev_size = prev_off_end - prev_off_start + 1; + prev_buf_end = prev_buf_start + prev_size - 1; + file_off_end = file_off_start + length - 1; + dr_off_end = dr_off_start + length - 1; + + if ((buf_start == prev_buf_end + 1) && + (file_off_start == prev_off_end + 1)) { + DEBUG_FILE("%s: extending previous table mmap to include the next region in file\n", __func__); + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx-1].file_end_off = file_off_end; + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx-1].dr_end_off = dr_off_end; + return; + } + + add_entry: + file_off_end = file_off_start + length - 1; + dr_off_end = dr_off_start + length - 1; /* the first entry jumps straight here, so the DR end offset must be computed on this path as well */ + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx].file_start_off = file_off_start; + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx].dr_start_off = dr_off_start; + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx].file_end_off = file_off_end; + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx].dr_end_off = dr_off_end; + _nvp_tbl_mmaps[index].tbl_mmaps[newest_tbl_idx].buf_start = buf_start; + _nvp_tbl_mmaps[index].tbl_mmap_index++; + if (_nvp_tbl_mmaps[index].tbl_mmap_index >= NUM_APP_TBL_MMAP_ENTRIES) + assert(0); + DEBUG_FILE("%s: Inserting entry. address = %p, tbl file start = %lu, tbl file end = %lu. Tbl IDX = %d. Inode = %lu\n", + __func__, buf_start, file_off_start, file_off_end, newest_tbl_idx, node->serialno); +} + +#if DATA_JOURNALING_ENABLED + +void insert_over_tbl_mmap_entry(struct NVNode *node, + off_t file_off_start, + off_t dr_off_start, + size_t length, + unsigned long buf_start) +{ + int index = node->serialno % APPEND_TBL_MAX; + int reg_index = node->serialno % LARGE_TBL_MAX; + struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[index]; + struct NVTable_maps *tbl_append = &_nvp_tbl_mmaps[index]; + struct NVTable_regions *regions = NULL, *regions_2 = NULL; + int region_id = 0; + size_t overlap_len = length; + off_t file_off_end = file_off_start + length - 1; + off_t dr_off_end = dr_off_start + length - 1; + int idx_to_insert_large = 0, idx_to_insert_small = 0, idx_to_insert_large_2 = -1, idx_to_insert = 0, i = 0; + off_t prev_off_start = 0, prev_off_end = 0; + off_t prev_buf_start = 0, prev_buf_end = 0; + off_t prev_dr_end = 0; + size_t prev_size = 0; + int tbl_entries_added = 0, tbl_entries_added_2 = 0; + off_t prev_off_start_large = 0, prev_off_end_large = 0; + off_t prev_buf_start_large = 0, prev_buf_end_large = 0; + off_t prev_dr_end_large = 0; + + + DEBUG_FILE("%s: Requesting Insert of file start = %lu, length = %lu, buf_start = %p. 
Inode = %lu\n", + __func__, file_off_start, length, buf_start, node->serialno); + + if (node->is_large_file) { + regions = _nvp_tbl_regions[reg_index].regions; + region_id = file_off_start / REGION_COVERAGE; + if (region_id > _nvp_tbl_regions[reg_index].num_regions) { + _nvp_tbl_regions[reg_index].num_regions = region_id; + } + } + + clear_overlapping_entry(file_off_start, length, tbl_append); + if (node->is_large_file) { + DEBUG_FILE("%s: Checked overlap for appends, now overwrites\n", __func__); + tbl_entries_added = clear_overlapping_entry_large(file_off_start, length, regions); + _nvp_tbl_regions[reg_index].num_tbl_mmaps += tbl_entries_added; + } + else + clear_overlapping_entry(file_off_start, length, tbl_over); + + if (node->is_large_file) { + idx_to_insert_large = find_idx_to_insert_large(file_off_start, regions); + if (idx_to_insert_large == 0) + goto shift_and_add; + else + goto merge_entries; + } else { + idx_to_insert = find_idx_to_insert(file_off_start, tbl_over); + if (idx_to_insert == 0) + goto shift_and_add; + } + + merge_entries: + if (node->is_large_file) { + prev_off_start_large = regions[region_id].tbl_mmaps[idx_to_insert_large-1].file_start_off; + prev_off_end_large = regions[region_id].tbl_mmaps[idx_to_insert_large-1].file_end_off; + prev_size = prev_off_end_large - prev_off_start_large + 1; + prev_buf_start_large = regions[region_id].tbl_mmaps[idx_to_insert_large-1].buf_start; + prev_buf_end_large = prev_buf_start_large + prev_size - 1; + prev_dr_end_large = regions[region_id].tbl_mmaps[idx_to_insert_large-1].dr_end_off; + + if (dr_off_start > dr_off_end) { + MSG("%s: index of entry = %d, dr_off_start = %lld, dr_off_end = %lld\n", __func__, idx_to_insert_large-1, dr_off_start, dr_off_end); + assert(0); + } + + if ((file_off_start == prev_off_end_large + 1) && + (buf_start == prev_buf_end_large + 1) && + (prev_dr_end_large != 0)) { + regions[region_id].tbl_mmaps[idx_to_insert_large-1].file_end_off = file_off_end; + regions[region_id].tbl_mmaps[idx_to_insert_large-1].dr_end_off = dr_off_end; + + if (regions[region_id].highest_off < file_off_end) + regions[region_id].highest_off = file_off_end; + + regions[region_id].region_dirty = 1; + if (_nvp_tbl_regions[reg_index].max_dirty_region < region_id) + _nvp_tbl_regions[reg_index].max_dirty_region = region_id; + if (_nvp_tbl_regions[reg_index].min_dirty_region > region_id) + _nvp_tbl_regions[reg_index].min_dirty_region = region_id; + + DEBUG_FILE("%s: Merging\n", __func__); + return; + } + } + + if (!node->is_large_file) { + prev_off_start = tbl_over->tbl_mmaps[idx_to_insert-1].file_start_off; + prev_off_end = tbl_over->tbl_mmaps[idx_to_insert-1].file_end_off; + prev_size = prev_off_end - prev_off_start + 1; + prev_buf_start = tbl_over->tbl_mmaps[idx_to_insert-1].buf_start; + prev_buf_end = prev_buf_start + prev_size - 1; + prev_dr_end = tbl_over->tbl_mmaps[idx_to_insert-1].dr_end_off; + + if (dr_off_start > dr_off_end) { + MSG("%s: index of entry = %d, dr_off_start = %lld, dr_off_end = %lld\n", __func__, idx_to_insert-1, dr_off_start, dr_off_end); + assert(0); + } + + if ((file_off_start == prev_off_end + 1) && + (buf_start == prev_buf_end + 1) && + (prev_dr_end != 0)) { + tbl_over->tbl_mmaps[idx_to_insert-1].file_end_off = file_off_end; + tbl_over->tbl_mmaps[idx_to_insert-1].dr_end_off = dr_off_end; + return; + } + } + + shift_and_add: + if (node->is_large_file) { + regions[region_id].tbl_mmaps[idx_to_insert_large].file_start_off = file_off_start; + regions[region_id].tbl_mmaps[idx_to_insert_large].dr_start_off = dr_off_start; + 
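+ /* Descriptive note on the entry being populated here: each table_mmaps element records that file range [file_start_off, file_end_off] is currently served from staging (DR) range [dr_start_off, dr_end_off], mmap'd at buf_start. */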
regions[region_id].tbl_mmaps[idx_to_insert_large].file_end_off = file_off_end; + regions[region_id].tbl_mmaps[idx_to_insert_large].dr_end_off = dr_off_end; + regions[region_id].tbl_mmaps[idx_to_insert_large].buf_start = buf_start; + regions[region_id].tbl_mmap_index++; + _nvp_tbl_regions[reg_index].num_tbl_mmaps++; + + if (regions[region_id].lowest_off > file_off_start) + regions[region_id].lowest_off = file_off_start; + if (regions[region_id].highest_off < file_off_end) + regions[region_id].highest_off = file_off_end; + + DEBUG_FILE("%s: No Merge\n", __func__); + regions[region_id].region_dirty = 1; + if (_nvp_tbl_regions[reg_index].max_dirty_region < region_id) + _nvp_tbl_regions[reg_index].max_dirty_region = region_id; + if (_nvp_tbl_regions[reg_index].min_dirty_region > region_id) + _nvp_tbl_regions[reg_index].min_dirty_region = region_id; + + return; + } + + for (i = tbl_over->tbl_mmap_index-1; i >= idx_to_insert; i--) { + tbl_over->tbl_mmaps[i+1].file_start_off = tbl_over->tbl_mmaps[i].file_start_off; + tbl_over->tbl_mmaps[i+1].dr_start_off = tbl_over->tbl_mmaps[i].dr_start_off; + tbl_over->tbl_mmaps[i+1].file_end_off = tbl_over->tbl_mmaps[i].file_end_off; + tbl_over->tbl_mmaps[i+1].dr_end_off = tbl_over->tbl_mmaps[i].dr_end_off; + tbl_over->tbl_mmaps[i+1].buf_start = tbl_over->tbl_mmaps[i].buf_start; + } + + tbl_over->tbl_mmaps[idx_to_insert].file_start_off = file_off_start; + tbl_over->tbl_mmaps[idx_to_insert].dr_start_off = dr_off_start; + tbl_over->tbl_mmaps[idx_to_insert].file_end_off = file_off_end; + tbl_over->tbl_mmaps[idx_to_insert].dr_end_off = dr_off_end; + tbl_over->tbl_mmaps[idx_to_insert].buf_start = buf_start; + tbl_over->tbl_mmap_index++; + + DEBUG_FILE("%s: Inserting entry. address = %p, tbl file start = %lu, tbl file end = %lu. Tbl IDX = %d. IDX to insert = %d. Inode = %lu\n", + __func__, buf_start, file_off_start, file_off_end, tbl_over->tbl_mmap_index, idx_to_insert, node->serialno); +} + +#endif // DATA_JOURNALING_ENABLED + +int read_tbl_mmap_entry(struct NVNode *node, + off_t file_off_start, + size_t length, + unsigned long *mmap_addr, + size_t *extent_length, + int check_append_entry) +{ + off_t tbl_mmap_off_start = 0, tbl_mmap_off_end = 0, + off_start_diff = 0, effective_tbl_mmap_off_start = 0; + size_t tbl_mmap_entry_len = 0; + int i = 0; + int app_index = node->serialno % APPEND_TBL_MAX; + int over_index = node->serialno % OVER_TBL_MAX; + int reg_index = node->serialno % LARGE_TBL_MAX; + int idx = 0, idx_2 = 0, idx_small = 0, region_id = 0, region_id_2 = 0; + size_t extent_length_2 = 0, extent_length_small = 0; + unsigned long mmap_addr_2 = 0, mmap_addr_small = 0; + struct NVTable_maps *tbl_app = &_nvp_tbl_mmaps[app_index]; + struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[over_index]; + struct NVTable_regions *regions = NULL, *regions_2 = NULL; + + DEBUG_FILE("%s: inode number = %lu. 
offset to read = %lld\n", + __func__, node->serialno, file_off_start); + +#if DATA_JOURNALING_ENABLED + + if (node->is_large_file) { + regions = _nvp_tbl_regions[reg_index].regions; + region_id = file_off_start / REGION_COVERAGE; + } + + if (node->is_large_file) { + idx = find_idx_to_read_large(file_off_start, regions, &region_id); + } + else + idx = find_idx_to_read(file_off_start, tbl_over); + + if (idx != -1) { + if (node->is_large_file) { + off_start_diff = file_off_start - regions[region_id].tbl_mmaps[idx].file_start_off; + effective_tbl_mmap_off_start = regions[region_id].tbl_mmaps[idx].file_start_off + off_start_diff; + tbl_mmap_entry_len = regions[region_id].tbl_mmaps[idx].file_end_off - effective_tbl_mmap_off_start + 1; + if (tbl_mmap_entry_len > length) + tbl_mmap_entry_len = length; + + *extent_length = tbl_mmap_entry_len; + *mmap_addr = regions[region_id].tbl_mmaps[idx].buf_start + off_start_diff; + + DEBUG_FILE("%s: LARGE reading address = %p, size = %lu, tbl file start = %lu, tbl file end = %lu, Tbl IDX = %d. Inode = %lu\n", + __func__, *mmap_addr, *extent_length, regions[region_id].tbl_mmaps[idx].file_start_off, regions[region_id].tbl_mmaps[idx].file_end_off, idx, node->serialno); + + return 0; + + } else { + off_start_diff = file_off_start - tbl_over->tbl_mmaps[idx].file_start_off; + effective_tbl_mmap_off_start = tbl_over->tbl_mmaps[idx].file_start_off + off_start_diff; + tbl_mmap_entry_len = tbl_over->tbl_mmaps[idx].file_end_off - effective_tbl_mmap_off_start + 1; + if (tbl_mmap_entry_len > length) + tbl_mmap_entry_len = length; + *extent_length = tbl_mmap_entry_len; + *mmap_addr = tbl_over->tbl_mmaps[idx].buf_start + off_start_diff; + DEBUG_FILE("%s: reading address = %p, size = %lu, tbl file start = %lu, tbl file end = %lu, Tbl IDX = %d. Inode = %lu\n", + __func__, *mmap_addr, *extent_length, tbl_over->tbl_mmaps[idx].file_start_off, tbl_over->tbl_mmaps[idx].file_end_off, idx, node->serialno); + return 0; + } + } + +#endif // DATA_JOURNALING_ENABLED + + if (check_append_entry) { + idx = find_idx_to_read(file_off_start, tbl_app); + if (idx != -1) { + off_start_diff = file_off_start - tbl_app->tbl_mmaps[idx].file_start_off; + effective_tbl_mmap_off_start = tbl_app->tbl_mmaps[idx].file_start_off + off_start_diff; + tbl_mmap_entry_len = tbl_app->tbl_mmaps[idx].file_end_off - effective_tbl_mmap_off_start + 1; + if (tbl_mmap_entry_len > length) + tbl_mmap_entry_len = length; + *extent_length = tbl_mmap_entry_len; + *mmap_addr = tbl_app->tbl_mmaps[idx].buf_start + off_start_diff; + DEBUG_FILE("%s: reading address = %p, size = %lu, tbl file start = %lu, tbl file end = %lu, Tbl IDX = %d. 
Inode = %lu\n", + __func__, *mmap_addr, *extent_length, tbl_app->tbl_mmaps[idx].file_start_off, tbl_app->tbl_mmaps[idx].file_end_off, idx, node->serialno); + return 0; + } + } + + *mmap_addr = 0; + return 0; +} + +int clear_tbl_mmap_entry(struct NVTable_maps *tbl, int num_entries) +{ + int i = 0; + size_t len = 0; + off_t offset_in_page = 0; + + DEBUG_FILE("%s: Number of mmap entries = %d\n", __func__, tbl->tbl_mmap_index); + if (tbl->tbl_mmap_index > 0) { + deleted_size += tbl->tbl_mmaps[tbl->tbl_mmap_index-1].file_end_off; + DEBUG_FILE("%s: Total size deleted = %lu\n", __func__, deleted_size); + memset((void *)tbl->tbl_mmaps, 0, num_entries*sizeof(struct table_mmaps)); + tbl->tbl_mmap_index = 0; + } + + return 0; + +#if 0 + int i = 0; + size_t len = 0; + off_t offset_in_page = 0; + + DEBUG_FILE("%s: Number of mmap entries = %d\n", __func__, tbl->tbl_mmap_index); + for (i = 0; i < tbl->tbl_mmap_index; i++) { + len = tbl->tbl_mmaps[i].file_end_off - tbl->tbl_mmaps[i].file_start_off + 1; + munmap((void *)tbl->tbl_mmaps[i].buf_start, len); + deleted_size += len; + } + DEBUG_FILE("%s: Total size deleted = %lu\n", __func__, deleted_size); + if (tbl->tbl_mmap_index) { + memset((void *)tbl->tbl_mmaps, 0, NUM_APP_TBL_MMAP_ENTRIES*sizeof(struct table_mmaps)); + tbl->tbl_mmap_index = 0; + } + return 0; +#endif +} diff --git a/splitfs_syscall_intercept/src/tbl_mmaps.h b/splitfs_syscall_intercept/src/tbl_mmaps.h new file mode 100644 index 0000000000..12da1b7822 --- /dev/null +++ b/splitfs_syscall_intercept/src/tbl_mmaps.h @@ -0,0 +1,119 @@ +// Header file for nvmfileops.c + +#ifndef __LEDGER_TBL_MMAPS_H_ +#define __LEDGER_TBL_MMAPS_H_ + +#include +#include +#include +#include +#include "inode.h" +#include "nvp_lock.h" + +struct table_mmaps +{ + off_t file_start_off; + off_t file_end_off; + off_t dr_start_off; + off_t dr_end_off; + unsigned long buf_start; +}; + +struct NVTable_maps +{ + NVP_LOCK_DECL; + struct table_mmaps *tbl_mmaps; + int tbl_mmap_index; +}; + +struct NVTable_regions +{ + struct table_mmaps *tbl_mmaps; + off_t lowest_off; + off_t highest_off; + int tbl_mmap_index; + int region_dirty; +}; + +struct NVLarge_maps +{ + NVP_LOCK_DECL; + struct NVTable_regions *regions; + int num_tbl_mmaps; + int num_regions; + int min_dirty_region; + int max_dirty_region; +}; + +extern struct NVTable_maps *_nvp_tbl_mmaps; +extern struct NVTable_maps *_nvp_over_tbl_mmaps; +extern struct NVLarge_maps *_nvp_tbl_regions; + +void get_lowest_tbl_elem(off_t *over_file_start, + off_t *over_file_end, + off_t *over_dr_start, + off_t *over_dr_end, + struct NVTable_maps *tbl, + int idx_in_over); + +void get_tbl_elem_large(off_t *over_file_start, + off_t *over_file_end, + off_t *over_dr_start, + off_t *over_dr_end, + struct table_mmaps *tbl_mmaps, + int idx_in_over); + +int get_lowest_tbl_elem_large(off_t *over_file_start, + off_t *over_file_end, + off_t *over_dr_start, + off_t *over_dr_end, + struct table_mmaps *tbl_mmaps, + int tbl_mmap_index, + off_t max_value); + +void insert_tbl_mmap_entry(struct NVNode *node, + off_t file_off_start, + off_t dr_off_start, + size_t length, + unsigned long buf_start); +void insert_over_tbl_mmap_entry(struct NVNode *node, + off_t file_off_start, + off_t dr_off_start, + size_t length, + unsigned long buf_start); +int read_tbl_mmap_entry(struct NVNode *node, + off_t file_off_start, + size_t length, + unsigned long *mmap_addr, + size_t *extent_length, + int check_append_entry); +int clear_tbl_mmap_entry(struct NVTable_maps *tbl, int num_entries); + +#define TBL_MMAP_LOCKING 1 +#if 
TBL_MMAP_LOCKING + +#define TBL_ENTRY_LOCK_RD(tbl, cpuid) {if (tbl) {NVP_LOCK_RD(tbl->lock, cpuid);}} +#define TBL_ENTRY_UNLOCK_RD(tbl, cpuid) {if (tbl) {NVP_LOCK_UNLOCK_RD(tbl->lock, cpuid);}} +#define TBL_ENTRY_LOCK_WR(tbl) {if (tbl) {NVP_LOCK_WR(tbl->lock);}} +#define TBL_ENTRY_UNLOCK_WR(tbl) {if (tbl) {NVP_LOCK_UNLOCK_WR(tbl->lock);}} + +#else + +#define TBL_ENTRY_LOCK_RD(tbl, cpuid) {(void)(cpuid);} +#define TBL_ENTRY_UNLOCK_RD(tbl, cpuid) {(void)(cpuid);} +#define TBL_ENTRY_LOCK_WR(tbl) {(void)(tbl->lock);} +#define TBL_ENTRY_UNLOCK_WR(tbl) {(void)(tbl->lock);} + +#endif + +#define LARGE_TBL_MAX 5 +#define APPEND_TBL_MAX 4096 +#define OVER_TBL_MAX 4096 +#define NUM_APP_TBL_MMAP_ENTRIES 1024 +#define NUM_OVER_TBL_MMAP_ENTRIES 1024 + +#define REGION_COVERAGE (40*1024) +#define LARGE_TBL_REGIONS (500*1024*1024 / REGION_COVERAGE) +#define PER_REGION_TABLES 100 // (REGION_COVERAGE / 1024) + +#endif diff --git a/splitfs_syscall_intercept/src/thread_handle.c b/splitfs_syscall_intercept/src/thread_handle.c new file mode 100644 index 0000000000..17bbe6d4f1 --- /dev/null +++ b/splitfs_syscall_intercept/src/thread_handle.c @@ -0,0 +1,171 @@ +#include "thread_handle.h" + +void activateBgThread() { + + pthread_mutex_lock(&mu); + + run_background_thread = 1; + pthread_cond_signal(&bgsignal); + + pthread_mutex_unlock(&mu); + + //bgCloseFiles(); +} + +void *bgThreadWrapper() { + + start: + pthread_mutex_lock(&mu); + + waiting_for_signal = 1; + while(!run_background_thread) { + pthread_cond_wait(&bgsignal, &mu); + } + + waiting_for_signal = 0; + + pthread_mutex_unlock(&mu); + + bgCloseFiles(0); + + if(!exit_bgthread) + goto start; + + started_bgthread = 0; + + return NULL; +} + +void startBgThread() { + + if (!started_bgthread) { + started_bgthread = 1; + pthread_create(&bgthread, NULL, &bgThreadWrapper, NULL); + } +} + +void waitForBgThread() { + if(started_bgthread) { + pthread_join(bgthread, NULL); + } +} + +void cancelBgThread() { + if(started_bgthread) { + pthread_cancel(bgthread); + pthread_testcancel(); + } +} + +void initEnvForBg() { + pthread_cond_init(&bgsignal, NULL); + pthread_mutex_init(&mu, NULL); +} + +void checkAndActivateBgThread() { + if(run_background_thread) + return; + if(dr_mem_closed_files > lim_dr_mem_closed || num_files_closed > lim_num_files || cleanup) { + calledBgThread++; + activateBgThread(); + } +} + +void bgCloseFiles(int main_thread) { + + instrumentation_type clf_lock_time, bg_thread_time; + int closed_filedesc = -1; + ino_t closed_serialno = 0; +#if SEQ_LIST || RAND_LIST + struct ClosedFiles *clf = NULL; + int i=0; +#else //SEQ_LIST || RAND_LIST + struct InodeClosedFile *tbl = NULL; + int hash_index = -1; +#endif //SEQ_LIST || RAND_LIST + + START_TIMING(bg_thread_t, bg_thread_time); + + GLOBAL_LOCK_CLOSE_WR(); + +#if SEQ_LIST + for (i = 0; i < 1024; i++) { + if (!num_files_closed) + break; + + clf = &_nvp_closed_files[i]; + LRU_NODE_LOCK_WR(clf); + + closed_filedesc = remove_from_seq_list(clf, &closed_serialno); + long clo_res = 0; + _sfs_REAL_CLOSE(closed_filedesc, closed_serialno, 1, &clo_res); + if (!clo_res) { + __atomic_fetch_sub(&num_files_closed, 1, __ATOMIC_SEQ_CST); + } + + LRU_NODE_UNLOCK_WR(clf); + } + +#elif RAND_LIST + while(num_files_closed) { + if (dr_mem_closed_files <= (500*1024*1024) && num_files_closed < 500 && !cleanup) { + ASYNC_CLOSING = 1; + break; + } + + i = rand() % TOTAL_CLOSED_INODES; + clf = &_nvp_closed_files[i]; + + if (clf->fd == -1) + continue; + + START_TIMING(clf_lock_t, clf_lock_time); + LRU_NODE_LOCK_WR(clf); + 
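+ /* The clf_lock_t timer brackets just the LRU_NODE_LOCK_WR() acquisition above, so contention on a closed-file slot shows up directly in the timing stats. */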
END_TIMING(clf_lock_t, clf_lock_time); + + closed_filedesc = remove_from_seq_list(clf, &closed_serialno); + + long clo_res = 0; + _sfs_REAL_CLOSE(closed_filedesc, closed_serialno, 1, &clo_res); + if (!clo_res) { + __atomic_fetch_sub(&num_files_closed, 1, __ATOMIC_SEQ_CST); + if (!main_thread) + num_async_close++; + } + + LRU_NODE_UNLOCK_WR(clf); + } +#else + while (num_files_closed) { + hash_index = lru_tail_serialno % 1024; + tbl = &inode_to_closed_file[hash_index]; + NVP_LOCK_HASH_TABLE_WR(tbl); + + if (lru_tail_serialno == 0 || tbl->index == -1) { + NVP_UNLOCK_HASH_TABLE_WR(tbl); + break; + } + + if (dr_mem_closed_files <= (500*1024*1024) && !cleanup) { + NVP_UNLOCK_HASH_TABLE_WR(tbl); + break; + } + + closed_filedesc = remove_from_lru_list_policy(&closed_serialno); + long clo_res = 0; + _sfs_REAL_CLOSE(closed_filedesc, closed_serialno, 1, &clo_res); + if (!clo_res) { + __atomic_fetch_sub(&num_files_closed, 1, __ATOMIC_SEQ_CST); + } + + NVP_UNLOCK_HASH_TABLE_WR(tbl); + } + +#endif + + GLOBAL_UNLOCK_CLOSE_WR(); + + END_TIMING(bg_thread_t, bg_thread_time); + + run_background_thread = 0; +} diff --git a/splitfs_syscall_intercept/src/thread_handle.h b/splitfs_syscall_intercept/src/thread_handle.h new file mode 100644 index 0000000000..417f84099f --- /dev/null +++ b/splitfs_syscall_intercept/src/thread_handle.h @@ -0,0 +1,41 @@ +#ifndef __NV_THREAD_HANDLER_H_ +#define __NV_THREAD_HANDLER_H_ + +#include <pthread.h> +#include <stdint.h> /* uint64_t */ +#include "file.h" +#include "lru_cache.h" +#include "timers.h" +#include "nvp_lock.h" + +pthread_t bgthread; +pthread_cond_t bgsignal; +pthread_mutex_t mu; + +uint64_t lim_dr_mem_closed; +uint64_t lim_num_files; +uint64_t lim_dr_mem; + +int run_background_thread; +int started_bgthread; +int exit_bgthread; +int calledBgThread; +int waiting_for_signal; +int cleanup; + +void activateBgThread(); + +void *bgThreadWrapper(); + +void startBgThread(); + +void waitForBgThread(); + +void cancelBgThread(); + +void initEnvForBg(); + +void checkAndActivateBgThread(); + +void bgCloseFiles(int main_thread); +#endif diff --git a/splitfs_syscall_intercept/src/timers.c b/splitfs_syscall_intercept/src/timers.c new file mode 100644 index 0000000000..d7478ae34a --- /dev/null +++ b/splitfs_syscall_intercept/src/timers.c @@ -0,0 +1,171 @@ +#include "timers.h" + +unsigned int num_open; +unsigned int num_close; +unsigned int num_async_close; +unsigned int num_read; +unsigned int num_write; +unsigned int num_stat; +unsigned int num_unlink; +unsigned int num_appendfsync; +unsigned int num_memcpy_read; +unsigned int num_anon_read; +unsigned int num_memcpy_write; +unsigned int num_append_write; +unsigned int num_posix_read; +unsigned int num_posix_write; +unsigned int num_fsync; +unsigned int num_mfence; +unsigned int num_write_nontemporal; +unsigned int num_write_temporal; +unsigned int num_clflushopt; +unsigned int num_mmap; +unsigned int num_drs; +unsigned int num_drs_critical_path; +unsigned long long appendfsync_size; +unsigned long long non_temporal_write_size; +unsigned long long temporal_write_size; +unsigned long long read_size; +unsigned long long write_size; +unsigned long long memcpy_read_size; +unsigned long long anon_read_size; +unsigned long long memcpy_write_size; +unsigned long long append_write_size; +unsigned long long posix_read_size; +unsigned long long posix_write_size; +unsigned long long total_syscalls; +unsigned long long deleted_size; +volatile size_t _nvp_wr_extended; +volatile size_t _nvp_wr_total; +atomic_uint_fast64_t num_drs_left; + +void nvp_init_io_stats(void) +{ + num_open = 0; 
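+ /* The counters are plain globals, so this reset is not thread-safe; it is expected to run once during library initialization. */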
+ num_close = 0; + num_async_close = 0; + num_read = 0; + num_write = 0; + num_stat = 0; + num_unlink = 0; + num_appendfsync = 0; + num_memcpy_read = 0; + num_anon_read = 0; + num_memcpy_write = 0; + num_append_write = 0; + num_posix_read = 0; + num_posix_write = 0; + num_fsync = 0; + num_mfence = 0; + num_write_nontemporal = 0; + num_write_temporal = 0; + num_clflushopt = 0; + num_mmap = 0; + num_drs = 0; + num_drs_critical_path = 0; + appendfsync_size = 0; + non_temporal_write_size = 0; + temporal_write_size = 0; + read_size = 0; + write_size = 0; + memcpy_read_size = 0; + anon_read_size = 0; + memcpy_write_size = 0; + append_write_size = 0; + posix_read_size = 0; + posix_write_size = 0; + total_syscalls = 0; + deleted_size = 0; + _nvp_wr_extended = 0; + _nvp_wr_total = 0; + num_drs_left = 0; + +} + +void nvp_print_io_stats(void) +{ + MSG("====================== NVP IO stats: ======================\n"); + MSG("open %u, close %u, async close %u\n", num_open, num_close, num_async_close); + MSG("mmap %u, unlink %u, stat %u\n", num_mmap, num_unlink, num_stat); + MSG("dr mmap %u, dr mmap critical path %u\n", num_drs, num_drs_critical_path); + MSG("fsync %u, appendfsync: count %u size %llu average %llu\n", + num_fsync, num_appendfsync, appendfsync_size, + num_appendfsync ? appendfsync_size / num_appendfsync : 0); + MSG("READ: count %u, size %llu, average %llu\n", num_read, + read_size, num_read ? read_size / num_read : 0); + MSG("WRITE: count %u, size %llu, average %llu\n", num_write, + write_size, num_write ? write_size / num_write : 0); + MSG("memcpy READ: count %u, size %llu, average %llu\n", + num_memcpy_read, memcpy_read_size, + num_memcpy_read ? memcpy_read_size / num_memcpy_read : 0); + MSG("anon READ: count %u, size %llu, average %llu\n", + num_anon_read, anon_read_size, + num_anon_read ? anon_read_size / num_anon_read : 0); + MSG("memcpy WRITE: count %u, size %llu, average %llu\n", + num_memcpy_write, memcpy_write_size, + num_memcpy_write ? memcpy_write_size / num_memcpy_write : 0); + MSG("anon WRITE: count %u, size %llu, average %llu\n", + num_append_write, append_write_size, + num_append_write ? append_write_size / num_append_write : 0); + MSG("posix READ: count %u, size %llu, average %llu\n", + num_posix_read, posix_read_size, + num_posix_read ? posix_read_size / num_posix_read : 0); + MSG("posix WRITE: count %u, size %llu, average %llu\n", + num_posix_write, posix_write_size, + num_posix_write ? posix_write_size / num_posix_write : 0); + MSG("write extends %lu, total %lu\n", _nvp_wr_extended, + _nvp_wr_total); + MSG("MFENCE: count %u\n", + num_mfence); + MSG("CLFLUSHOPT: count %u\n", + num_clflushopt); + MSG("NON_TEMPORAL_WRITES: count %u, size %llu, average %llu\n", + num_write_nontemporal, non_temporal_write_size, + num_write_nontemporal ? non_temporal_write_size / num_write_nontemporal : 0); + MSG("TEMPORAL WRITES: count %u, size %llu, average %llu\n", + num_write_temporal, temporal_write_size, + num_write_temporal ? 
temporal_write_size / num_write_temporal : 0); + MSG("TOTAL SYSCALLS (open + close + read + write + fsync): count %llu\n", + num_open + num_close + num_posix_read + num_posix_write + num_fsync); +} + +const char *Instruprint[INSTRUMENT_NUM] = +{ + "open", + "close", + "pread", + "pwrite", + "read", + "write", + "seek", + "fsync", + "unlink", + "bg_thread", + "clf_lock", + "node_lookup_lock", + "nvnode_lock", + "dr_mem_queue", + "file_mmap", + "close_syscall", + "copy_to_dr_pool", + "copy_to_mmap_cache", + "appends", + "clear_dr", + "swap_extents", + "give_up_node", + "get_mmap", + "get_dr_mmap", + "copy_overread", + "copy_overwrite", + "copy_appendread", + "copy_appendwrite", + "read_tbl_mmap", + "insert_tbl_mmap", + "clear_mmap_tbl", + "append_log_entry", + "op_log_entry", + "append_log_reinit", + "remove_overlapping_entry", + "device", + "soft_overhead", +}; diff --git a/splitfs_syscall_intercept/src/timers.h b/splitfs_syscall_intercept/src/timers.h new file mode 100644 index 0000000000..e21a552332 --- /dev/null +++ b/splitfs_syscall_intercept/src/timers.h @@ -0,0 +1,137 @@ +#ifndef _LEDGER_TIMERS_H_ +#define _LEDGER_TIMERS_H_ + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> /* size_t */ +#include <time.h> /* clock_gettime, struct timespec */ +#include <stdatomic.h> /* atomic counter types */ + +extern unsigned int num_open; +extern unsigned int num_close; +extern unsigned int num_async_close; +extern unsigned int num_read; +extern unsigned int num_write; +extern unsigned int num_stat; +extern unsigned int num_unlink; +extern unsigned int num_appendfsync; +extern unsigned int num_memcpy_read; +extern unsigned int num_anon_read; +extern unsigned int num_memcpy_write; +extern unsigned int num_append_write; +extern unsigned int num_posix_read; +extern unsigned int num_posix_write; +extern unsigned int num_fsync; +extern unsigned int num_mfence; +extern unsigned int num_write_nontemporal; +extern unsigned int num_write_temporal; +extern unsigned int num_clflushopt; +extern unsigned int num_mmap; +extern unsigned int num_drs; +extern unsigned int num_drs_critical_path; +extern unsigned long long appendfsync_size; +extern unsigned long long non_temporal_write_size; +extern unsigned long long temporal_write_size; +extern unsigned long long read_size; +extern unsigned long long write_size; +extern unsigned long long memcpy_read_size; +extern unsigned long long anon_read_size; +extern unsigned long long memcpy_write_size; +extern unsigned long long append_write_size; +extern unsigned long long posix_read_size; +extern unsigned long long posix_write_size; +extern unsigned long long total_syscalls; +extern unsigned long long deleted_size; +extern volatile size_t _nvp_wr_extended; +extern volatile size_t _nvp_wr_total; +extern atomic_uint_fast64_t num_drs_left; + +void nvp_init_io_stats(void); +void nvp_print_io_stats(void); + +enum instrumentation_vars { + open_t, + close_t, + pread_t, + pwrite_t, + read_t, + write_t, + seek_t, + fsync_t, + unlink_t, + bg_thread_t, + clf_lock_t, + node_lookup_lock_t, + nvnode_lock_t, + dr_mem_queue_t, + file_mmap_t, + close_syscall_t, + copy_to_dr_pool_t, + copy_to_mmap_cache_t, + appends_t, + clear_dr_t, + swap_extents_t, + give_up_node_t, + get_mmap_t, + get_dr_mmap_t, + copy_overread_t, + copy_overwrite_t, + copy_appendread_t, + copy_appendwrite_t, + read_tbl_mmap_t, + insert_tbl_mmap_t, + clear_mmap_tbl_t, + append_log_entry_t, + op_log_entry_t, + append_log_reinit_t, + remove_overlapping_entry_t, + device_t, + soft_overhead_t, + INSTRUMENT_NUM, +}; + +static atomic_uint_least64_t Instrustats[INSTRUMENT_NUM]; +extern const char *Instruprint[INSTRUMENT_NUM]; +typedef struct timespec 
+static atomic_uint_least64_t Instrustats[INSTRUMENT_NUM];
+extern const char *Instruprint[INSTRUMENT_NUM];
+typedef struct timespec instrumentation_type;
+
+#define INITIALIZE_TIMERS() \
+    { \
+        int i; \
+        for (i = 0; i < INSTRUMENT_NUM; i++) \
+            Instrustats[i] = 0; \
+    }
+
+#if INSTRUMENT_CALLS
+
+#define START_TIMING(name, start) \
+    { \
+        clock_gettime(CLOCK_MONOTONIC, &start); \
+    }
+
+#define END_TIMING(name, start) \
+    { \
+        instrumentation_type end; \
+        clock_gettime(CLOCK_MONOTONIC, &end); \
+        __atomic_fetch_add(&Instrustats[name], (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec), __ATOMIC_SEQ_CST); \
+    }
+
+#define PRINT_TIME() \
+    { \
+        int i; \
+        for (i = 0; i < INSTRUMENT_NUM; i++) \
+            printf("%s: %lu ns\n", Instruprint[i], \
+                   (unsigned long) Instrustats[i]); \
+    }
+
+#else // INSTRUMENT_CALLS
+
+#define START_TIMING(name, start) {}
+#define END_TIMING(name, start) {}
+#define PRINT_TIME() {}
+
+#endif // INSTRUMENT_CALLS
+
+#endif // _LEDGER_TIMERS_H_
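For reference, the macros above are used in pairs around the region being measured. A minimal sketch of the intended usage, assuming the code is compiled with INSTRUMENT_CALLS set to 1 and linked against timers.c (timed_read_example is a hypothetical caller, not part of the patch):

    #include "timers.h"

    void timed_read_example(void)
    {
        instrumentation_type read_time;      /* struct timespec */

        START_TIMING(read_t, read_time);     /* record CLOCK_MONOTONIC start */
        /* ... the read being measured ... */
        END_TIMING(read_t, read_time);       /* atomically add the elapsed
                                                nanoseconds to Instrustats[read_t] */

        PRINT_TIME();                        /* dump all accumulated timers */
    }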
Inode = %lu\n", __func__, path, file_st.st_ino); + + START_TIMING(clear_mmap_tbl_t, clear_mmap_tbl_time); + + TBL_ENTRY_LOCK_WR(tbl_app); + TBL_ENTRY_LOCK_WR(tbl_over); + clear_tbl_mmap_entry(tbl_app, NUM_APP_TBL_MMAP_ENTRIES); + +#if DATA_JOURNALING_ENABLED + + clear_tbl_mmap_entry(tbl_over, NUM_OVER_TBL_MMAP_ENTRIES); + +#endif // DATA_JOURNALING_ENABLED + + TBL_ENTRY_UNLOCK_WR(tbl_over); + TBL_ENTRY_UNLOCK_WR(tbl_app); + END_TIMING(clear_mmap_tbl_t, clear_mmap_tbl_time); + +#if BG_CLOSING + GLOBAL_LOCK_CLOSE_WR(); + hash_index = file_st.st_ino % TOTAL_CLOSED_INODES; +#if SEQ_LIST || RAND_LIST + clf = &_nvp_closed_files[hash_index]; + + LRU_NODE_LOCK_WR(clf); + + stale_fd = remove_from_seq_list_hash(clf, file_st.st_ino); +#else //SEQ_LIST || RAND_LIST + tbl = &inode_to_closed_file[hash_index]; + NVP_LOCK_HASH_TABLE_WR(tbl); + stale_fd = remove_from_lru_list_hash(file_st.st_ino, 0); +#endif //SEQ_LIST || RAND_LIST + if(stale_fd >= 0) { + closed_filedesc = stale_fd; + closed_serialno = file_st.st_ino; + + long clo_res = 0; + _sfs_REAL_CLOSE(closed_filedesc, closed_serialno, 1, &clo_res); + if(!clo_res) + __atomic_fetch_sub(&num_files_closed, 1, __ATOMIC_SEQ_CST); + } +#if SEQ_LIST || RAND_LIST + LRU_NODE_UNLOCK_WR(clf); +#else //SEQ_LIST || RAND_LIST + NVP_UNLOCK_HASH_TABLE_WR(tbl); +#endif //SEQ_LIST || RAND_LIST + GLOBAL_UNLOCK_CLOSE_WR(); +#endif //BG_CLOSING + + mappingToBeRemoved = &_nvp_ino_mapping[index]; + if(file_st.st_ino == mappingToBeRemoved->serialno && mappingToBeRemoved->root_dirty_num) { + nvp_free_btree(mappingToBeRemoved->root, mappingToBeRemoved->merkle_root, mappingToBeRemoved->height, mappingToBeRemoved->root_dirty_cache, mappingToBeRemoved->root_dirty_num, mappingToBeRemoved->total_dirty_mmaps); + mappingToBeRemoved->serialno = 0; + } + } + num_unlink++; + *result = syscall_no_intercept(SYS_unlink, path); + +#if !POSIX_ENABLED + if(*result == 0) { + START_TIMING(op_log_entry_t, op_log_entry_time); + persist_op_entry(LOG_FILE_UNLINK, + path, + NULL, + 0, + 0); + END_TIMING(op_log_entry_t, op_log_entry_time); + } +#endif + + END_TIMING(unlink_t, unlink_time); + GLOBAL_UNLOCK_WR(); + return RETT_NO_PASS_KERN; +} + +RETT_SYSCALL_INTERCEPT _sfs_UNLINKAT(INTF_SYSCALL) +{ + instrumentation_type op_log_entry_time; + int dirfd, flags; + char *path; + + DEBUG("CALL: _sfs_UNLINKAT\n"); + + dirfd = (int)arg0; + path = (char *)arg1; + flags = (int)arg2; + + //GLOBAL_LOCK_WR(); + *result = syscall_no_intercept(SYS_unlinkat, dirfd, path, flags); + +#if !POSIX_ENABLED + START_TIMING(op_log_entry_t, op_log_entry_time); + persist_op_entry(LOG_FILE_UNLINK, + path, + NULL, + 0, + 0); + END_TIMING(op_log_entry_t, op_log_entry_time); +#endif + return RETT_NO_PASS_KERN; +} diff --git a/splitfs_syscall_intercept/src/util.h b/splitfs_syscall_intercept/src/util.h new file mode 100755 index 0000000000..9649ab27b8 --- /dev/null +++ b/splitfs_syscall_intercept/src/util.h @@ -0,0 +1,44 @@ +#ifndef _UTIL_H_ +#define _UTIL_H_ + +#ifdef __GNUC__ +#define TYPEOF(x) (__typeof__(x)) +#else +#define TYPEOF(x) +#endif + +#if defined(__i386__) + +static inline unsigned long long asm_rdtsc(void) +{ + unsigned long long int x; + __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); + return x; +} + +static inline unsigned long long asm_rdtscp(void) +{ + unsigned hi, lo; + __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"ecx"); + return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); + +} +#elif defined(__x86_64__) + +static inline unsigned long long asm_rdtsc(void) +{ + unsigned hi, lo; + 
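For readers new to the library: handlers such as _sfs_UNLINK above are dispatched from syscall_intercept's single hook point. A minimal, self-contained sketch of that mechanism, independent of SplitFS (intercept_hook_point and syscall_no_intercept are the library's public API; the counting logic is illustrative only):

    /* Count unlink() calls and service them ourselves. Build roughly as
     * cc -shared -fPIC hook.c -o hook.so -lsyscall_intercept, then run a
     * program with LD_PRELOAD=./hook.so. */
    #include <libsyscall_intercept_hook_point.h>
    #include <syscall.h>

    static long unlink_count;

    static int hook(long syscall_number,
                    long arg0, long arg1, long arg2,
                    long arg3, long arg4, long arg5,
                    long *result)
    {
        (void) arg1; (void) arg2; (void) arg3; (void) arg4; (void) arg5;

        if (syscall_number != SYS_unlink)
            return 1;               /* nonzero: let the kernel handle it */

        unlink_count++;
        *result = syscall_no_intercept(SYS_unlink, arg0);
        return 0;                   /* zero: do not pass to the kernel */
    }

    static __attribute__((constructor)) void init(void)
    {
        intercept_hook_point = hook;
    }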
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); +} + +static inline unsigned long long asm_rdtscp(void) +{ + unsigned hi, lo; + __asm__ __volatile__ ("rdtscp" : "=a"(lo), "=d"(hi)::"rcx"); + return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); +} +#else +#error "Only support for X86 architecture" +#endif +#endif diff --git a/splitfs_syscall_intercept/src/utils.c b/splitfs_syscall_intercept/src/utils.c new file mode 100644 index 0000000000..22abdb75a1 --- /dev/null +++ b/splitfs_syscall_intercept/src/utils.c @@ -0,0 +1,9 @@ +#include "utils.h" + +size_t align_next_page(size_t address) { + return ((address + PAGE_SIZE) >> PAGE_SHIFT) << PAGE_SHIFT; +} + +size_t align_cur_page(size_t address) { + return (address >> PAGE_SHIFT) << PAGE_SHIFT; +} \ No newline at end of file diff --git a/splitfs_syscall_intercept/src/utils.h b/splitfs_syscall_intercept/src/utils.h new file mode 100644 index 0000000000..42f5850575 --- /dev/null +++ b/splitfs_syscall_intercept/src/utils.h @@ -0,0 +1,24 @@ +#ifndef _SPLITFS_UTILS_H +#define _SPLITFS_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1 << 12) +#define PAGE_MASK = ~(PAGE_SIZE - 1) + +size_t align_next_page(size_t address); +size_t align_cur_page(size_t address); + +#endif diff --git a/splitfs_syscall_intercept/src/write.c b/splitfs_syscall_intercept/src/write.c new file mode 100644 index 0000000000..782ded7056 --- /dev/null +++ b/splitfs_syscall_intercept/src/write.c @@ -0,0 +1,767 @@ +/* + * ===================================================================================== + * + * Filename: write.c + * + * Description: + * + * Version: 1.0 + * Created: 09/25/2019 03:19:46 PM + * Revision: none + * Compiler: gcc + * + * Author: YOUR NAME (), + * Organization: + * + * ===================================================================================== + */ +// required for sched_getcpu (GET_CPUID) +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif +#include +#include + +#include +#include "timers.h" +#include "add_delay.h" +#include "handle_mmaps.h" +#include "tbl_mmaps.h" +#include "nvp_lock.h" +#include "file.h" +#include "inode.h" +#include "staging.h" +#include "log.h" +#include "relink.h" + +#if !DATA_JOURNALING_ENABLED +static ssize_t write_to_file_mmap(int file, + off_t write_offset_within_true_length, + size_t len_to_write_within_true_length, + int wr_lock, + int cpuid, + const void *buf, + struct NVFile *nvf) +{ + int ret = 0; + unsigned long mmap_addr = 0, bitmap_root = 0; + off_t offset_within_mmap = 0; + size_t extent_length = 0, write_count = 0, posix_write = 0, data_written = 0; + instrumentation_type copy_overwrite_time, get_mmap_time; + + while(len_to_write_within_true_length > 0) { + START_TIMING(get_mmap_t, get_mmap_time); + ret = nvp_get_mmap_address(nvf, + write_offset_within_true_length, + write_count, + &mmap_addr, + &bitmap_root, + &offset_within_mmap, + &extent_length, + wr_lock, + cpuid, + NULL, + NULL); + END_TIMING(get_mmap_t, get_mmap_time); + + switch (ret) { + case 0: // Mmaped. Do memcpy. + break; + case 1: // Not mmaped. Calling Posix pread. 
+            posix_write = syscall_no_intercept(SYS_pwrite64, file,
+                                               buf,
+                                               len_to_write_within_true_length,
+                                               write_offset_within_true_length);
+            num_posix_write++;
+            posix_write_size += posix_write;
+            return posix_write;
+        default:
+            break;
+        }
+
+        if (extent_length > len_to_write_within_true_length)
+            extent_length = len_to_write_within_true_length;
+
+        START_TIMING(copy_overwrite_t, copy_overwrite_time);
+
+#if NON_TEMPORAL_WRITES
+        DEBUG_FILE("%s: args: mmap_addr = %p, offset in mmap = %lu, length to write = %lu\n", __func__, (char *)mmap_addr, offset_within_mmap, extent_length);
+        if(MEMCPY_NON_TEMPORAL((char *)mmap_addr, buf, extent_length) == NULL) {
+            printf("%s: non-temporal memcpy failed\n", __func__);
+            fflush(NULL);
+            assert(0);
+        }
+        num_write_nontemporal++;
+        non_temporal_write_size += extent_length;
+#else //NON_TEMPORAL_WRITES
+        if(FSYNC_MEMCPY((char *)mmap_addr, buf, extent_length) == NULL) {
+            printf("%s: memcpy failed\n", __func__);
+            fflush(NULL);
+            assert(0);
+        }
+#endif //NON_TEMPORAL_WRITES
+#if NVM_DELAY
+        perfmodel_add_delay(0, extent_length);
+#endif
+        num_memcpy_write++;
+        memcpy_write_size += extent_length;
+        len_to_write_within_true_length -= extent_length;
+        write_offset_within_true_length += extent_length;
+        buf += extent_length;
+        data_written += extent_length;
+        num_mfence++;
+        _mm_sfence();
+
+        END_TIMING(copy_overwrite_t, copy_overwrite_time);
+    }
+    return data_written;
+}
+#endif
+
+/*
+ * _nvp_extend_write gets called whenever there is an append to a file. The write first goes to the
+ * anonymous memory region through memcpy. At fsync() time, the data is copied non-temporally from
+ * anonymous DRAM to the file.
+ */
+static ssize_t _nvp_extend_write(int file, const void *buf, size_t count, off_t offset,
+                                 int wr_lock,
+                                 int cpuid,
+                                 struct NVFile *nvf,
+                                 struct NVTable_maps *tbl_app,
+                                 struct NVTable_maps *tbl_over)
+{
+    size_t len_to_write, write_count;
+    off_t write_offset;
+    instrumentation_type get_dr_mmap_time, copy_appendwrite_time, clear_dr_time, swap_extents_time;
+    instrumentation_type device_time;
+
+    // Increment counter for append
+    _nvp_wr_extended++;
+    num_memcpy_write++;
+    num_append_write++;
+    DEBUG("Request write length %li will extend file. "
+          "(filelen=%li, offset=%li, count=%li)\n",
+          count, nvf->node->length, offset, count);
+    len_to_write = count;
+    write_count = 0;
+    write_offset = offset;
+    DEBUG_FILE("%s: requesting write of size %lu, offset = %lu. FD = %d\n", __func__, count, offset, nvf->fd);
+    unsigned long mmap_addr;
+    off_t offset_within_mmap, write_offset_wrt_true_length;
+    size_t extent_length, extension_with_node_length;
+    instrumentation_type append_log_entry_time;
+    extension_with_node_length = 0;
+
+ get_addr:
+    /* This is used mostly to check if the write is not an append,
+     * but is way beyond the length of the file.
+     */
+    write_offset_wrt_true_length = write_offset - (off_t) nvf->node->true_length;
+    DEBUG_FILE("%s: write offset = %lu, true length = %lu\n", __func__, write_offset, nvf->node->true_length);
+    // The destination address for the memcpy is obtained from nvp_get_dr_mmap_address() below.
+    START_TIMING(get_dr_mmap_t, get_dr_mmap_time);
+
+    nvp_get_dr_mmap_address(nvf, write_offset_wrt_true_length, len_to_write,
+                            write_count, &mmap_addr, &offset_within_mmap,
+                            &extent_length, wr_lock, cpuid, 1, tbl_app, tbl_over);
+    DEBUG_FILE("%s: extent_length = %lu, len_to_write = %lu\n",
+               __func__, extent_length, len_to_write);
+
+    END_TIMING(get_dr_mmap_t, get_dr_mmap_time);
+
+    if (extent_length < len_to_write) {
+        nvf->node->dr_info.dr_offset_end -= extent_length;
+        swap_extents(nvf, nvf->node->true_length);
+        off_t offset_in_page = 0;
+        nvf->node->true_length = nvf->node->length;
+        if (nvf->node->true_length >= LARGE_FILE_THRESHOLD)
+            nvf->node->is_large_file = 1;
+        START_TIMING(clear_dr_t, clear_dr_time);
+        DEBUG_FILE("%s: EXTENT_LENGTH < LEN_TO_WRITE, EXTENT FD = %d, extent_length = %lu, len_to_write = %lu\n",
+                   __func__,
+                   nvf->node->dr_info.dr_fd,
+                   extent_length,
+                   len_to_write);
+#if BG_CLEANING
+        change_dr_mmap(nvf->node, 0);
+#else
+        create_dr_mmap(nvf->node, 0);
+#endif
+        END_TIMING(clear_dr_t, clear_dr_time);
+
+        offset_in_page = (off_t) (nvf->node->true_length) % MMAP_PAGE_SIZE;
+        if (offset_in_page != 0 && nvf->node->dr_info.valid_offset < DR_SIZE) {
+            nvf->node->dr_info.valid_offset += (unsigned long) offset_in_page;
+            nvf->node->dr_info.dr_offset_start = DR_SIZE;
+            nvf->node->dr_info.dr_offset_end = nvf->node->dr_info.valid_offset;
+        }
+        if (nvf->node->dr_info.valid_offset == DR_SIZE) {
+            nvf->node->dr_info.valid_offset = DR_SIZE;
+            nvf->node->dr_info.dr_offset_start = DR_SIZE;
+            nvf->node->dr_info.dr_offset_end = DR_SIZE;
+        }
+        DEBUG_FILE("%s: RECEIVED OTHER EXTENT. dr fd = %d, dr addr = %p, dr v.o = %lu, dr start off = %lu, dr end off = %lu\n",
+                   __func__, nvf->node->dr_info.dr_fd, nvf->node->dr_info.start_addr, nvf->node->dr_info.valid_offset,
+                   nvf->node->dr_info.dr_offset_start, nvf->node->dr_info.dr_offset_end);
+
+        if (tbl_over != NULL) {
+            TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+        }
+        if (tbl_app != NULL) {
+            TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+        }
+        NVP_UNLOCK_NODE_WR(nvf);
+        NVP_LOCK_NODE_RD(nvf, cpuid);
+        if (tbl_app != NULL) {
+            TBL_ENTRY_LOCK_RD(tbl_app, cpuid);
+        }
+        if (tbl_over != NULL) {
+            TBL_ENTRY_LOCK_RD(tbl_over, cpuid);
+        }
+        DEBUG_FILE("%s: Cleared mmap\n", __func__);
+        goto get_addr;
+    }
+    if (extent_length > len_to_write)
+        extent_length = len_to_write;
+    if((extent_length + (size_t) write_offset) > nvf->node->length)
+        extension_with_node_length = extent_length + (size_t)write_offset - nvf->node->length;
+
+    if ((mmap_addr % MMAP_PAGE_SIZE) != (nvf->node->length % MMAP_PAGE_SIZE))
+        assert(0);
+
+    nvf->node->length += extension_with_node_length;
+
+    memcpy_write_size += extent_length;
+    append_write_size += extent_length;
+
+    if (!wr_lock) {
+        if (tbl_over != NULL) {
+            TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+        }
+        TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+        NVP_UNLOCK_NODE_WR(nvf);
+        NVP_LOCK_NODE_RD(nvf, cpuid);
+        if (tbl_app != NULL) {
+            TBL_ENTRY_LOCK_RD(tbl_app, cpuid);
+        }
+        if (tbl_over != NULL) {
+            TBL_ENTRY_LOCK_RD(tbl_over, cpuid);
+        }
+    }
+
+#if SYSCALL_APPENDS
+
+    offset_within_mmap = write_offset - nvf->node->true_length;
+    syscall_no_intercept(SYS_pwrite64, nvf->node->dr_info.dr_fd, buf, extent_length, offset_within_mmap);
+    syscall_no_intercept(SYS_fsync, nvf->fd);
+
+#else // SYSCALL APPENDS
+
+    // Write to anonymous DRAM. No dirty tracking to be performed here.
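+    /* Append path: the bytes are staged in the anonymous DR region with a
+     * non-temporal memcpy; they reach the real file later, when fsync
+     * relinks or copies the staged data (see swap_extents() above). */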
+    START_TIMING(copy_appendwrite_t, copy_appendwrite_time);
+    START_TIMING(device_t, device_time);
+
+    DEBUG_FILE("%s: memcpy args: buf = %p, mmap_addr = %p, length = %lu. File off = %lld. Inode = %lu\n", __func__, buf, (void *) mmap_addr, extent_length, write_offset, nvf->node->serialno);
+    if(MEMCPY_NON_TEMPORAL((char *)mmap_addr, buf, extent_length) == NULL) {
+        printf("%s: non-temporal memcpy failed\n", __func__);
+        fflush(NULL);
+        assert(0);
+    }
+    //_mm_sfence();
+    //num_mfence++;
+    num_write_nontemporal++;
+    non_temporal_write_size += extent_length;
+
+#if NVM_DELAY
+    perfmodel_add_delay(0, extent_length);
+#endif // NVM_DELAY
+
+    END_TIMING(device_t, device_time);
+    END_TIMING(copy_appendwrite_t, copy_appendwrite_time);
+
+    if (tbl_over != NULL) {
+        TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+    }
+    if (tbl_app != NULL) {
+        TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+    }
+    if (!wr_lock) {
+        NVP_UNLOCK_NODE_RD(nvf, cpuid);
+    } else {
+        NVP_UNLOCK_NODE_WR(nvf);
+    }
+    NVP_UNLOCK_FD_RD(nvf, cpuid);
+    // Log the append
+
+#if !POSIX_ENABLED
+
+    START_TIMING(append_log_entry_t, append_log_entry_time);
+    persist_append_entry(nvf->node->serialno,
+                         nvf->node->dr_info.dr_serialno,
+                         offset,
+                         offset_within_mmap,
+                         extent_length);
+    END_TIMING(append_log_entry_t, append_log_entry_time);
+#endif
+
+#endif // SYSCALL APPENDS
+
+    len_to_write -= extent_length;
+    write_offset += extent_length;
+    write_count += extent_length;
+    buf += extent_length;
+
+    DEBUG_FILE("%s: Returning write count = %lu. FD = %d\n", __func__, write_count, nvf->fd);
+    return (ssize_t) write_count;
+}
+
+ssize_t _nvp_do_pwrite(int file, const void *buf, size_t count, off_t offset,
+                       int wr_lock,
+                       int cpuid,
+                       struct NVFile *nvf,
+                       struct NVTable_maps *tbl_app,
+                       struct NVTable_maps *tbl_over)
+{
+    off_t write_offset, offset_within_mmap;
+    size_t write_count, extent_length;
+    size_t posix_write;
+    unsigned long mmap_addr = 0;
+    unsigned long bitmap_root = 0;
+    uint64_t extendFileReturn;
+    instrumentation_type appends_time, read_tbl_mmap_time, copy_overwrite_time, get_dr_mmap_time,
+                         append_log_entry_time, clear_dr_time, insert_tbl_mmap_time;
+    DEBUG_FILE("%s: fd = %d, offset = %lu, count = %lu\n", __func__, file, offset, count);
+    _nvp_wr_total++;
+
+    SANITYCHECKNVF(nvf);
+    if(UNLIKELY(!nvf->canWrite)) {
+        DEBUG("FD not open for writing: %i\n", file);
+        errno = EBADF;
+
+        TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+        TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+        NVP_UNLOCK_NODE_RD(nvf, cpuid);
+        NVP_UNLOCK_FD_RD(nvf, cpuid);
+        return -1;
+    }
+    if(nvf->aligned)
+    {
+        DEBUG("This write must be aligned. Checking alignment.\n");
+        if(UNLIKELY(count % 512))
+        {
+            DEBUG("count is not aligned to 512 (count was %li)\n",
+                  count);
+            errno = EINVAL;
+
+            TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+            TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+            NVP_UNLOCK_NODE_RD(nvf, cpuid);
+            NVP_UNLOCK_FD_RD(nvf, cpuid);
+            return -1;
+        }
+        if(UNLIKELY(offset % 512))
+        {
+            DEBUG("offset was not aligned to 512 "
+                  "(offset was %li)\n", offset);
+            errno = EINVAL;
+
+            TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+            TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+            NVP_UNLOCK_NODE_RD(nvf, cpuid);
+            NVP_UNLOCK_FD_RD(nvf, cpuid);
+            return -1;
+        }
+
+        if(UNLIKELY(((long long int)buf & (512-1)) != 0))
+        {
+            DEBUG("buffer was not aligned to 512 (buffer was %p, "
+                  "mod 512 = %li)\n", buf,
+                  (long long int)buf % 512);
+            errno = EINVAL;
+
+            TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+            TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+            NVP_UNLOCK_NODE_RD(nvf, cpuid);
+            NVP_UNLOCK_FD_RD(nvf, cpuid);
+            return -1;
+        }
+    }
+    if(nvf->append)
+    {
+        DEBUG("this fd (%i) is O_APPEND; setting offset from the "
+              "passed value (%li) to the end of the file (%li) "
+              "prior to writing anything\n", nvf->fd, offset,
+              nvf->node->length);
+        offset = nvf->node->length;
+    }
+
+    ssize_t len_to_write;
+    ssize_t extension_with_read_length;
+    DEBUG("time for a Pwrite. file length %li, offset %li, count %li\n",
+          nvf->node->length, offset, count);
+
+    len_to_write = count;
+
+    SANITYCHECK(nvf->valid);
+    SANITYCHECK(nvf->node != NULL);
+    SANITYCHECK(buf > 0);
+    SANITYCHECK(count >= 0);
+
+    write_count = 0;
+    write_offset = offset;
+
+    if (write_offset >= nvf->node->length + 1) {
+        DEBUG_FILE("%s: Hole getting created. Doing Write system call\n", __func__);
+        posix_write = syscall_no_intercept(SYS_pwrite64, file, buf, count, write_offset);
+        syscall_no_intercept(SYS_fsync, file);
+        num_posix_write++;
+        posix_write_size += posix_write;
+        if (!wr_lock) {
+            TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+            TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+            NVP_UNLOCK_NODE_RD(nvf, cpuid);
+
+            NVP_LOCK_NODE_WR(nvf);
+            TBL_ENTRY_LOCK_RD(tbl_app, cpuid);
+            TBL_ENTRY_LOCK_RD(tbl_over, cpuid);
+        }
+        if (write_offset + count <= nvf->node->length) {
+            DEBUG_FILE("%s: offset fault. Offset of write = %lu, count = %lu, node length = %lu\n", __func__, write_offset, count, nvf->node->length);
+            assert(0);
+        }
+
+        nvf->node->length = write_offset + count;
+        nvf->node->true_length = nvf->node->length;
+        if (nvf->node->true_length >= LARGE_FILE_THRESHOLD)
+            nvf->node->is_large_file = 1;
+
+        TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+        TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid);
+        NVP_UNLOCK_NODE_WR(nvf);
+        NVP_UNLOCK_FD_RD(nvf, cpuid);
+        return posix_write;
+    }
+
+    if (write_offset == nvf->node->length)
+        goto appends;
+
+    if (write_offset >= nvf->node->true_length) {
+        MSG("%s: write_offset = %lu, true_length = %lu\n", __func__, write_offset, nvf->node->true_length);
+        assert(0);
+    }
+
+#if DATA_JOURNALING_ENABLED
+
+    TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid);
+    TBL_ENTRY_LOCK_WR(tbl_over);
+
+    // Get the file-backed mmap address to which the write is to be performed.
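+    /* With data journaling on, overwrites are redirected to the DR overwrite
+     * region instead of updating the file-backed mmap in place; the new
+     * extent is recorded via insert_over_tbl_mmap_entry() and made durable
+     * with persist_append_entry() further below. */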
+ get_addr:
+    START_TIMING(get_dr_mmap_t, get_dr_mmap_time);
+
+    nvp_get_over_dr_address(nvf, write_offset, len_to_write,
+                            &mmap_addr, &offset_within_mmap,
+                            &extent_length, wr_lock, cpuid,
+                            tbl_app, tbl_over);
+    DEBUG_FILE("%s: extent_length = %lu, len_to_write = %lu\n",
+               __func__, extent_length, len_to_write);
+    END_TIMING(get_dr_mmap_t, get_dr_mmap_time);
+
+    if (extent_length < len_to_write) {
+        //size_t len_swapped = swap_extents(nvf, nvf->node->true_length);
+        off_t offset_in_page = 0;
+        START_TIMING(clear_dr_t, clear_dr_time);
+        DEBUG_FILE("%s: EXTENT_LENGTH < LEN_TO_WRITE, EXTENT FD = %d, extent_length = %lu, len_to_write = %lu\n",
+                   __func__,
+                   nvf->node->dr_info.dr_fd,
+                   extent_length,
+                   len_to_write);
+#if BG_CLEANING
+        change_dr_mmap(nvf->node, 1);
+#else
+        create_dr_mmap(nvf->node, 1);
+#endif
+        END_TIMING(clear_dr_t, clear_dr_time);
+
+        DEBUG_FILE("%s: RECEIVED OTHER EXTENT. dr over fd = %d, dr over addr = %p, dr over start off = %lu, dr over end off = %lu\n", __func__, nvf->node->dr_over_info.dr_fd, nvf->node->dr_over_info.start_addr, nvf->node->dr_over_info.dr_offset_start, nvf->node->dr_over_info.dr_offset_end);
+
+        TBL_ENTRY_UNLOCK_WR(tbl_over);
+        TBL_ENTRY_UNLOCK_WR(tbl_app);
+        NVP_UNLOCK_NODE_WR(nvf);
+        NVP_LOCK_NODE_RD(nvf, cpuid);
+
+        TBL_ENTRY_LOCK_RD(tbl_app, cpuid);
+        TBL_ENTRY_LOCK_WR(tbl_over);
+        goto get_addr;
+    }
+
+    TBL_ENTRY_UNLOCK_WR(tbl_over);
+    TBL_ENTRY_UNLOCK_WR(tbl_app);
+    NVP_UNLOCK_NODE_WR(nvf);
+    NVP_LOCK_NODE_RD(nvf, cpuid);
+
+    TBL_ENTRY_LOCK_RD(tbl_app, cpuid);
+    TBL_ENTRY_LOCK_RD(tbl_over, cpuid);
+
+#else // DATA_JOURNALING_ENABLED
+
+    START_TIMING(read_tbl_mmap_t, read_tbl_mmap_time);
+    read_tbl_mmap_entry(nvf->node, write_offset,
+                        len_to_write, &mmap_addr,
+                        &extent_length, 1);
+    END_TIMING(read_tbl_mmap_t, read_tbl_mmap_time);
+
+    if (mmap_addr == 0) {
+        extent_length = write_to_file_mmap(file, write_offset,
+                                           len_to_write, wr_lock,
+                                           cpuid, buf,
+                                           nvf);
+
+        goto post_write;
+    }
+
+#endif // DATA_JOURNALING_ENABLED
+
+    if (extent_length > len_to_write)
+        extent_length = len_to_write;
+
+    // The overwrite is performed on the file-backed mmap.
+    START_TIMING(copy_overwrite_t, copy_overwrite_time);
+
+#if NON_TEMPORAL_WRITES
+
+    DEBUG_FILE("%s: memcpy args: buf = %p, mmap_addr = %p, length = %lu. File off = %lld. Inode = %lu\n", __func__, buf, (void *) mmap_addr, extent_length, write_offset, nvf->node->serialno);
Inode = %lu\n", __func__, buf, (void *) mmap_addr, extent_length, write_offset, nvf->node->serialno); + + if(MEMCPY_NON_TEMPORAL((char *)mmap_addr, buf, extent_length) == NULL) { + printf("%s: non-temporal memcpy failed\n", __func__); + fflush(NULL); + assert(0); + } + _mm_sfence(); + num_mfence++; + num_write_nontemporal++; + non_temporal_write_size += extent_length; + +#else //NON_TEMPORAL_WRITES + + if(FSYNC_MEMCPY((char *)mmap_addr, buf, extent_length) != (char *)mmap_addr) { + printf("%s: memcpy failed\n", __func__); + fflush(NULL); + assert(0); + } + +#if DIRTY_TRACKING + + modifyBmap((struct merkleBtreeNode *)bitmap_root, offset_within_mmap, extent_length); + +#endif //DIRTY_TRACKING + + num_memcpy_write++; + +#endif //NON_TEMPORAL_WRITES + +#if NVM_DELAY + perfmodel_add_delay(0, extent_length); +#endif + + END_TIMING(copy_overwrite_t, copy_overwrite_time); + +#if DATA_JOURNALING_ENABLED + + START_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + insert_over_tbl_mmap_entry(nvf->node, + write_offset, + offset_within_mmap, + extent_length, + mmap_addr); + END_TIMING(insert_tbl_mmap_t, insert_tbl_mmap_time); + +#endif // DATA_JOURNALING_ENABLED + +#if !DATA_JOURNALING_ENABLED + post_write: +#endif + memcpy_write_size += extent_length; + len_to_write -= extent_length; + write_offset += extent_length; + write_count += extent_length; + buf += extent_length; + + TBL_ENTRY_UNLOCK_RD(tbl_over, cpuid); + TBL_ENTRY_UNLOCK_RD(tbl_app, cpuid); + + NVP_UNLOCK_NODE_RD(nvf, cpuid); + NVP_UNLOCK_FD_RD(nvf, cpuid); + +#if DATA_JOURNALING_ENABLED + + START_TIMING(append_log_entry_t, append_log_entry_time); + persist_append_entry(nvf->node->serialno, + nvf->node->dr_over_info.dr_serialno, + write_offset, + offset_within_mmap, + extent_length); + END_TIMING(append_log_entry_t, append_log_entry_time); + +#endif // DATA_JOURNALING_ENABLED + + return write_count; + + // If we need to append data, we should call _nvp_extend_write to write to anonymous mmap. + appends: + START_TIMING(appends_t, appends_time); + extendFileReturn = _nvp_extend_write(file, buf, + len_to_write, + write_offset, + wr_lock, cpuid, + nvf, + tbl_app, + tbl_over); + END_TIMING(appends_t, appends_time); + len_to_write -= extendFileReturn; + write_count += extendFileReturn; + write_offset += extendFileReturn; + buf += extendFileReturn; + + DEBUG("About to return from _nvp_PWRITE with ret val %li. file len: " + "%li, file off: %li, map len: %li, node %p\n", + count, nvf->node->length, nvf->offset, + nvf->node->maplength, nvf->node); + return write_count; + } + +static ssize_t _nvp_check_write_size_valid(size_t count) +{ + if(count == 0) + { + DEBUG("Requested a write of 0 bytes. 
No problem\n"); + return 0; + } + + if(((signed long long int)count) < 0) + { + DEBUG("Requested a write of %li < 0 bytes.\n", + (signed long long int)count); + errno = EINVAL; + return -1; + } + + return count; +} + +RETT_SYSCALL_INTERCEPT _sfs_WRITE(INTF_SYSCALL) +{ + DEBUG("%s: %d\n",__func__, file); + num_write++; + int file, res; + instrumentation_type write_time; + + file = (int)arg0; + + if(!_fd_intercept_lookup[file]) { + return RETT_PASS_KERN; + } + + char *buf; + int length; + + buf = (char *)arg1; + length = (int)arg2; + + START_TIMING(write_t, write_time); + + GLOBAL_LOCK_WR(); + + struct NVFile* nvf = &_nvp_fd_lookup[file]; + + if (nvf->posix) { + DEBUG("Call posix WRITE for fd %d\n", nvf->fd); + res = syscall_no_intercept(SYS_write, file, buf, length); + write_size += res; + num_posix_write++; + posix_write_size += res; + END_TIMING(write_t, write_time); + GLOBAL_UNLOCK_WR(); + *result = res; + return RETT_NO_PASS_KERN; + } + + if (nvf->node == NULL) { + res = syscall_no_intercept(SYS_write, file, buf, length); + write_size += res; + num_posix_write++; + posix_write_size += res; + END_TIMING(write_t, write_time); + GLOBAL_UNLOCK_WR(); + *result = res; + return RETT_NO_PASS_KERN; + } + + int cpuid = GET_CPUID(); + struct NVTable_maps *tbl_app = &_nvp_tbl_mmaps[nvf->node->serialno % APPEND_TBL_MAX]; + +#if DATA_JOURNALING_ENABLED + struct NVTable_maps *tbl_over = &_nvp_over_tbl_mmaps[nvf->node->serialno % OVER_TBL_MAX]; +#else + struct NVTable_maps *tbl_over = NULL; +#endif // DATA_JOURNALING_ENABLED + + res = _nvp_check_write_size_valid(length); + if (res <= 0) { + END_TIMING(write_t, write_time); + GLOBAL_UNLOCK_WR(); + *result = res; + return RETT_NO_PASS_KERN; + } + + NVP_LOCK_FD_RD(nvf, cpuid); // TODO + NVP_LOCK_NODE_RD(nvf, cpuid); //TODO + + TBL_ENTRY_LOCK_RD(tbl_app, cpuid); + TBL_ENTRY_LOCK_RD(tbl_over, cpuid); + + res = _nvp_do_pwrite(file, buf, length, + __sync_fetch_and_add(nvf->offset, length), + 0, + cpuid, + nvf, + tbl_app, + tbl_over); + + if(res >= 0) + { + if(nvf->append) + { + DEBUG("PWRITE succeeded and append == true. " + "Setting offset to end...\n"); + //fflush(NULL); + //assert(_nvp_do_seek64(nvf->fd, 0, SEEK_END, nvf) + // != (RETT_SEEK64)-1); + } + else + { + DEBUG("PWRITE succeeded: extending offset " + "from %li to %li\n", + *nvf->offset - res, *nvf->offset); + } + } + + DEBUG("About to return from _nvp_WRITE with ret val %i (errno %i). 
" + "file len: %li, file off: %li, map len: %li\n", + res, errno, nvf->node->length, nvf->offset, + nvf->node->maplength); + + write_size += res; + + END_TIMING(write_t, write_time); + GLOBAL_UNLOCK_WR(); + + DEBUG_FILE("%s: Returning %d\n", __func__, res); + if(res == -1) { + *result = -errno; + } + *result = res; + return RETT_NO_PASS_KERN; +} diff --git a/tests/Makefile b/tests/Makefile index 178e9ea316..19418eea3a 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -11,6 +11,9 @@ PJD_DIR=$(CWD)/pjd-fstest-20080816 all: pjd.posix pjd.sync pjd.strict +# Runs all the tests for SplitFS integrated with syscall_intercept +all_sysint: pjd.posix_sysint pjd.sync_sysint pjd.strict_sysint + # Compile with posix specific flags pjd.posix_compile: export LEDGER_DATAJ=0 && \ @@ -18,6 +21,12 @@ pjd.posix_compile: $(MAKE) -C ../splitfs clean && \ $(MAKE) -e -C ../splitfs +pjd.posix_compile_sysint: + export LEDGER_DATAJ=0 && \ + export LEDGER_POSIX=1 && \ + $(MAKE) -C ../splitfs_syscall_intercept/src clean && \ + $(MAKE) -e -C ../splitfs_syscall_intercept/src + # Compile with sync specific flags pjd.sync_compile: export LEDGER_DATAJ=0 && \ @@ -25,6 +34,12 @@ pjd.sync_compile: $(MAKE) -C ../splitfs clean && \ $(MAKE) -e -C ../splitfs +pjd.sync_compile_sysint: + export LEDGER_DATAJ=0 && \ + export LEDGER_POSIX=0 && \ + $(MAKE) -C ../splitfs_syscall_intercept/src clean && \ + $(MAKE) -e -C ../splitfs_syscall_intercept/src + # Compile with strict specific flags pjd.strict_compile: export LEDGER_DATAJ=1 && \ @@ -32,6 +47,12 @@ pjd.strict_compile: $(MAKE) -C ../splitfs clean && \ $(MAKE) -e -C ../splitfs +pjd.strict_compile_sysint: + export LEDGER_DATAJ=1 && \ + export LEDGER_POSIX=0 && \ + $(MAKE) -C ../splitfs_syscall_intercept/src clean && \ + $(MAKE) -e -C ../splitfs_syscall_intercept/src + pjd.compile: $(MAKE) -C $(PJD_DIR) clean $(MAKE) -C $(PJD_DIR) @@ -42,8 +63,15 @@ pjd.run: export LD_PRELOAD=$(ROOT)/splitfs/libnvp.so; \ cd $(SFS_PATH) && prove -r $(PJD_DIR)/tests +pjd.run_sysint: + export LD_PRELOAD=$(ROOT)/splitfs_syscall_intercept/src/libnvp.so; \ + cd $(SFS_PATH) && prove -r $(PJD_DIR)/tests + pjd.posix: pjd.posix_compile pjd.compile pjd.run +pjd.posix_sysint: pjd.posix_compile_sysint pjd.compile pjd.run_sysint pjd.sync: pjd.sync_compile pjd.compile pjd.run +pjd.sync_sysint: pjd.sync_compile_sysint pjd.compile pjd.run_sysint pjd.strict: pjd.strict_compile pjd.compile pjd.run +pjd.strict_sysint: pjd.strict_compile_sysint pjd.compile pjd.run_sysint