Date: Thu, 6 Apr 2017 09:34:54 +0000 (UTC) From: Hans Petter Selasky <hselasky@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r316562 - in head/sys/compat/linuxkpi/common: include/linux src Message-ID: <201704060934.v369Ys50077560@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: hselasky Date: Thu Apr 6 09:34:54 2017 New Revision: 316562 URL: https://svnweb.freebsd.org/changeset/base/316562 Log: Implement proper support for memory map operations in the LinuxKPI, like open, close and fault using the character device pager. Some notes about the implementation: 1) Linux drivers set the vm_ops and vm_private_data fields during a mmap() call to indicate that the driver wants to use the LinuxKPI VM operations. Otherwise these operations are not used. 2) The vm_private_data pointer is associated with a VM area structure and inserted into an internal LinuxKPI list. If the vm_private_data pointer already exists, the existing VM area structure is used instead of the allocated one which gets freed. 3) The LinuxKPI's vm_private_data pointer is used as the callback handle for the FreeBSD VM object. The VM subsystem in FreeBSD has a similar list to identify equal handles and will only call the character device pager's close function once. 4) All LinuxKPI VM operations are serialized through the mmap_sem semaphore, which is per process, which prevents simultaneous access to the shared VM area structure when receiving page faults. 
Obtained from: kmacy @ MFC after: 1 week Sponsored by: Mellanox Technologies Modified: head/sys/compat/linuxkpi/common/include/linux/mm.h head/sys/compat/linuxkpi/common/include/linux/page.h head/sys/compat/linuxkpi/common/src/linux_compat.c Modified: head/sys/compat/linuxkpi/common/include/linux/mm.h ============================================================================== --- head/sys/compat/linuxkpi/common/include/linux/mm.h Thu Apr 6 09:07:01 2017 (r316561) +++ head/sys/compat/linuxkpi/common/include/linux/mm.h Thu Apr 6 09:34:54 2017 (r316562) @@ -38,6 +38,7 @@ #include <linux/kernel.h> #include <linux/mm_types.h> #include <linux/pfn.h> +#include <linux/list.h> #include <asm/pgtable.h> @@ -89,12 +90,25 @@ CTASSERT((VM_PROT_ALL & -(1 << 8)) == 0) typedef int (*pte_fn_t)(pte_t *, pgtable_t, unsigned long addr, void *data); struct vm_area_struct { - vm_offset_t vm_start; - vm_offset_t vm_end; - vm_offset_t vm_pgoff; - vm_paddr_t vm_pfn; /* PFN For mmap. */ - vm_size_t vm_len; /* length for mmap. 
*/ - vm_memattr_t vm_page_prot; + vm_offset_t vm_start; + vm_offset_t vm_end; + vm_offset_t vm_pgoff; + pgprot_t vm_page_prot; + unsigned long vm_flags; + struct mm_struct *vm_mm; + void *vm_private_data; + const struct vm_operations_struct *vm_ops; + struct linux_file *vm_file; + + /* internal operation */ + vm_paddr_t vm_pfn; /* PFN for memory map */ + vm_size_t vm_len; /* length for memory map */ + vm_pindex_t vm_pfn_first; + int vm_pfn_count; + int *vm_pfn_pcount; + vm_object_t vm_obj; + vm_map_t vm_cached_map; + TAILQ_ENTRY(vm_area_struct) vm_entry; }; struct vm_fault { Modified: head/sys/compat/linuxkpi/common/include/linux/page.h ============================================================================== --- head/sys/compat/linuxkpi/common/include/linux/page.h Thu Apr 6 09:07:01 2017 (r316561) +++ head/sys/compat/linuxkpi/common/include/linux/page.h Thu Apr 6 09:34:54 2017 (r316562) @@ -47,6 +47,28 @@ typedef unsigned long pgprot_t; #define page vm_page +#define LINUXKPI_PROT_VALID (1 << 4) +#define LINUXKPI_CACHE_MODE_SHIFT 3 + +static inline pgprot_t +cachemode2protval(vm_memattr_t attr) +{ + return ((attr | LINUXKPI_PROT_VALID) << LINUXKPI_CACHE_MODE_SHIFT); +} + +static inline vm_memattr_t +pgprot2cachemode(pgprot_t prot) +{ + int val; + + val = prot >> LINUXKPI_CACHE_MODE_SHIFT; + + if (val & LINUXKPI_PROT_VALID) + return (val & ~LINUXKPI_PROT_VALID); + else + return (VM_MEMATTR_DEFAULT); +} + #define virt_to_page(x) PHYS_TO_VM_PAGE(vtophys((x))) #define page_to_pfn(pp) (VM_PAGE_TO_PHYS((pp)) >> PAGE_SHIFT) #define pfn_to_page(pfn) (PHYS_TO_VM_PAGE((pfn) << PAGE_SHIFT)) Modified: head/sys/compat/linuxkpi/common/src/linux_compat.c ============================================================================== --- head/sys/compat/linuxkpi/common/src/linux_compat.c Thu Apr 6 09:07:01 2017 (r316561) +++ head/sys/compat/linuxkpi/common/src/linux_compat.c Thu Apr 6 09:34:54 2017 (r316562) @@ -2,7 +2,7 @@ * Copyright (c) 2010 Isilon Systems, Inc. 
* Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. - * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. + * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -88,6 +88,8 @@ MALLOC_DEFINE(M_KMALLOC, "linux", "Linux #undef cdev #define RB_ROOT(head) (head)->rbh_root +static struct vm_area_struct *linux_cdev_handle_find(void *handle); + struct kobject linux_class_root; struct device linux_root_device; struct class linux_class_misc; @@ -394,6 +396,166 @@ linux_file_dtor(void *cdp) } static int +linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, + vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) +{ + struct vm_area_struct *vmap; + struct vm_fault vmf; + int err; + + linux_set_current(curthread); + + /* get VM area structure */ + vmap = linux_cdev_handle_find(vm_obj->handle); + MPASS(vmap != NULL); + MPASS(vmap->vm_private_data == vm_obj->handle); + + /* fill out VM fault structure */ + vmf.virtual_address = (void *)(pidx << PAGE_SHIFT); + vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + vmf.pgoff = 0; + vmf.page = NULL; + + VM_OBJECT_WUNLOCK(vm_obj); + + down_write(&vmap->vm_mm->mmap_sem); + if (unlikely(vmap->vm_ops == NULL)) { + err = VM_FAULT_SIGBUS; + } else { + vmap->vm_pfn_count = 0; + vmap->vm_pfn_pcount = &vmap->vm_pfn_count; + vmap->vm_obj = vm_obj; + + err = vmap->vm_ops->fault(vmap, &vmf); + + while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { + kern_yield(0); + err = vmap->vm_ops->fault(vmap, &vmf); + } + } + + /* translate return code */ + switch (err) { + case VM_FAULT_OOM: + err = VM_PAGER_AGAIN; + break; + case VM_FAULT_SIGBUS: + err = VM_PAGER_BAD; + break; + case VM_FAULT_NOPAGE: + /* + * By contract the fault handler will return having + * busied all the pages itself. 
If pidx is already + * found in the object, it will simply xbusy the first + * page and return with vm_pfn_count set to 1. + */ + *first = vmap->vm_pfn_first; + *last = *first + vmap->vm_pfn_count - 1; + err = VM_PAGER_OK; + break; + default: + err = VM_PAGER_ERROR; + break; + } + up_write(&vmap->vm_mm->mmap_sem); + VM_OBJECT_WLOCK(vm_obj); + return (err); +} + +static struct rwlock linux_vma_lock; +static TAILQ_HEAD(, vm_area_struct) linux_vma_head = + TAILQ_HEAD_INITIALIZER(linux_vma_head); + +static struct vm_area_struct * +linux_cdev_handle_insert(void *handle, struct vm_area_struct *vmap) +{ + struct vm_area_struct *ptr; + + rw_wlock(&linux_vma_lock); + TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) { + if (ptr->vm_private_data == handle) { + rw_wunlock(&linux_vma_lock); + kfree(vmap); + return (NULL); + } + } + TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry); + rw_wunlock(&linux_vma_lock); + return (vmap); +} + +static void +linux_cdev_handle_remove(struct vm_area_struct *vmap) +{ + if (vmap == NULL) + return; + + rw_wlock(&linux_vma_lock); + TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); + rw_wunlock(&linux_vma_lock); + kfree(vmap); +} + +static struct vm_area_struct * +linux_cdev_handle_find(void *handle) +{ + struct vm_area_struct *vmap; + + rw_rlock(&linux_vma_lock); + TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { + if (vmap->vm_private_data == handle) + break; + } + rw_runlock(&linux_vma_lock); + return (vmap); +} + +static int +linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + const struct vm_operations_struct *vm_ops; + struct vm_area_struct *vmap; + + vmap = linux_cdev_handle_find(handle); + MPASS(vmap != NULL); + + *color = 0; + + down_write(&vmap->vm_mm->mmap_sem); + vm_ops = vmap->vm_ops; + if (likely(vm_ops != NULL)) + vm_ops->open(vmap); + up_write(&vmap->vm_mm->mmap_sem); + + return (0); +} + +static void +linux_cdev_pager_dtor(void *handle) +{ + const struct 
vm_operations_struct *vm_ops; + struct vm_area_struct *vmap; + + vmap = linux_cdev_handle_find(handle); + MPASS(vmap != NULL); + + down_write(&vmap->vm_mm->mmap_sem); + vm_ops = vmap->vm_ops; + if (likely(vm_ops != NULL)) + vm_ops->close(vmap); + up_write(&vmap->vm_mm->mmap_sem); + + linux_cdev_handle_remove(vmap); +} + +static struct cdev_pager_ops linux_cdev_pager_ops = { + .cdev_pg_populate = linux_cdev_pager_populate, + .cdev_pg_ctor = linux_cdev_pager_ctor, + .cdev_pg_dtor = linux_cdev_pager_dtor +}; + +static int linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct linux_cdev *ldev; @@ -707,10 +869,11 @@ static int linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { + struct vm_area_struct *vmap; struct linux_file *filp; struct thread *td; struct file *file; - struct vm_area_struct vma; + vm_memattr_t attr; int error; td = curthread; @@ -720,39 +883,82 @@ linux_dev_mmap_single(struct cdev *dev, if ((error = devfs_get_cdevpriv((void **)&filp)) != 0) return (error); filp->f_flags = file->f_flag; + + if (filp->f_op->mmap == NULL) + return (ENODEV); + linux_set_current(td); - vma.vm_start = 0; - vma.vm_end = size; - vma.vm_pgoff = *offset / PAGE_SIZE; - vma.vm_pfn = 0; - vma.vm_page_prot = VM_MEMATTR_DEFAULT; - if (filp->f_op->mmap) { - error = -filp->f_op->mmap(filp, &vma); - if (error == 0) { - struct sglist *sg; - - sg = sglist_alloc(1, M_WAITOK); - sglist_append_phys(sg, - (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT, vma.vm_len); - *object = vm_pager_allocate(OBJT_SG, sg, vma.vm_len, - nprot, 0, td->td_ucred); - if (*object == NULL) { - sglist_free(sg); - error = EINVAL; - goto done; - } - *offset = 0; - if (vma.vm_page_prot != VM_MEMATTR_DEFAULT) { - VM_OBJECT_WLOCK(*object); - vm_object_set_memattr(*object, - vma.vm_page_prot); - VM_OBJECT_WUNLOCK(*object); - } + + vmap = kzalloc(sizeof(*vmap), GFP_KERNEL); + vmap->vm_start = 0; + vmap->vm_end = size; + 
vmap->vm_pgoff = *offset / PAGE_SIZE; + vmap->vm_pfn = 0; + vmap->vm_flags = vmap->vm_page_prot = nprot; + vmap->vm_ops = NULL; + vmap->vm_file = filp; + vmap->vm_mm = current->mm; + + if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { + error = EINTR; + } else { + error = -filp->f_op->mmap(filp, vmap); + up_write(&vmap->vm_mm->mmap_sem); + } + + if (error != 0) { + kfree(vmap); + return (error); + } + + attr = pgprot2cachemode(vmap->vm_page_prot); + + if (vmap->vm_ops != NULL) { + void *vm_private_data; + + if (vmap->vm_ops->fault == NULL || + vmap->vm_ops->open == NULL || + vmap->vm_ops->close == NULL || + vmap->vm_private_data == NULL) { + kfree(vmap); + return (EINVAL); } - } else - error = ENODEV; -done: - return (error); + + vm_private_data = vmap->vm_private_data; + + vmap = linux_cdev_handle_insert(vm_private_data, vmap); + + *object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE, + &linux_cdev_pager_ops, size, nprot, *offset, curthread->td_ucred); + + if (*object == NULL) { + linux_cdev_handle_remove(vmap); + return (EINVAL); + } + } else { + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + sglist_append_phys(sg, (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len); + + *object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len, + nprot, 0, curthread->td_ucred); + + kfree(vmap); + + if (*object == NULL) { + sglist_free(sg); + return (EINVAL); + } + } + + if (attr != VM_MEMATTR_DEFAULT) { + VM_OBJECT_WLOCK(*object); + vm_object_set_memattr(*object, attr); + VM_OBJECT_WUNLOCK(*object); + } + *offset = 0; + return (0); } struct cdevsw linuxcdevsw = { @@ -1484,6 +1690,7 @@ linux_compat_init(void *arg) #if defined(__i386__) || defined(__amd64__) linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); #endif + rw_init(&linux_vma_lock, "lkpi-vma-lock"); rootoid = SYSCTL_ADD_ROOT_NODE(NULL, OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); @@ -1514,6 +1721,8 @@ linux_compat_uninit(void *arg) linux_kobject_kfree_name(&linux_class_root); 
linux_kobject_kfree_name(&linux_root_device.kobj); linux_kobject_kfree_name(&linux_class_misc.kobj); + + rw_destroy(&linux_vma_lock); } SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201704060934.v369Ys50077560>