xfs_buf.c
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

/*
 * The xfs_buf.c code provides an abstract buffer cache model on top
 * of the Linux page cache. Cached metadata blocks for a file system
 * are hashed to the inode for the block device. xfs_buf.c assembles
 * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
 *
 * Written by Steve Lord, Jim Mostek, Russell Cattelan
 * and Rajagopal Ananthanarayanan ("ananth") at SGI.
 *
 */

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/suspend.h>
#include <linux/percpu.h>

#include "xfs_linux.h"

#ifndef GFP_READAHEAD
#define GFP_READAHEAD	(__GFP_NOWARN|__GFP_NORETRY)
#endif

/*
 * File wide globals
 */

STATIC kmem_cache_t *pagebuf_cache;
STATIC kmem_shaker_t pagebuf_shake;
STATIC int pagebuf_daemon_wakeup(int, unsigned int);
STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
STATIC struct workqueue_struct *pagebuf_logio_workqueue;
STATIC struct workqueue_struct *pagebuf_dataio_workqueue;

/*
 * Pagebuf debugging
 */

#ifdef PAGEBUF_TRACE
void
pagebuf_trace(
	xfs_buf_t	*pb,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(pagebuf_trace_buf,
		pb, id,
		(void *)(unsigned long)pb->pb_flags,
		(void *)(unsigned long)pb->pb_hold.counter,
		(void *)(unsigned long)pb->pb_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
		(void *)(unsigned long)pb->pb_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *pagebuf_trace_buf;
#define PAGEBUF_TRACE_SIZE	4096
#define PB_TRACE(pb, id, data)	\
	pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define PB_TRACE(pb, id, data)	do { } while (0)
#endif

#ifdef PAGEBUF_LOCK_TRACKING
# define PB_SET_OWNER(pb)	((pb)->pb_last_holder = current->pid)
# define PB_CLEAR_OWNER(pb)	((pb)->pb_last_holder = -1)
# define PB_GET_OWNER(pb)	((pb)->pb_last_holder)
#else
# define PB_SET_OWNER(pb)	do { } while (0)
# define PB_CLEAR_OWNER(pb)	do { } while (0)
# define PB_GET_OWNER(pb)	do { } while (0)
#endif
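The tracing block above packs the 64-bit pb_file_offset into two 32-bit trace slots and compiles PB_TRACE down to a no-op when PAGEBUF_TRACE is not defined. A minimal standalone sketch of those two idioms, in userspace C with hypothetical demo_* names (not part of xfs_buf.c):

#include <stdio.h>
#include <stdint.h>

/* Compile-time switch mirroring PAGEBUF_TRACE: when disabled, the macro
 * collapses to "do { } while (0)" so call sites need no #ifdefs. */
#ifdef DEMO_TRACE
#define DEMO_TRACE_OFFSET(off)	demo_trace_offset(off)
#else
#define DEMO_TRACE_OFFSET(off)	do { } while (0)
#endif

/* Split a 64-bit offset into two 32-bit slots, as pagebuf_trace() does
 * with (pb_file_offset >> 32) & 0xffffffff and pb_file_offset & 0xffffffff. */
static void demo_trace_offset(uint64_t off)
{
	uint32_t hi = (uint32_t)((off >> 32) & 0xffffffff);
	uint32_t lo = (uint32_t)(off & 0xffffffff);

	printf("offset 0x%llx -> hi 0x%x, lo 0x%x\n",
	       (unsigned long long)off, hi, lo);
}

int main(void)
{
	DEMO_TRACE_OFFSET(0x123456789abcULL);	/* no-op unless built with -DDEMO_TRACE */
	demo_trace_offset(0x123456789abcULL);	/* direct call, for illustration */
	return 0;
}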
/*
 * Pagebuf allocation / freeing.
 */

#define pb_to_gfp(flags) \
	(((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
	 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)

#define pb_to_km(flags) \
	 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

#define pagebuf_allocate(flags) \
	kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
#define pagebuf_deallocate(pb) \
	kmem_zone_free(pagebuf_cache, (pb));

/*
 * Pagebuf hashing
 */

#define NBITS	8
#define NHASH	(1<<NBITS)

typedef struct {
	struct list_head	pb_hash;
	spinlock_t		pb_hash_lock;
} pb_hash_t;

STATIC pb_hash_t	pbhash[NHASH];
#define pb_hash(pb)	&pbhash[pb->pb_hash_index]

STATIC int
_bhash(
	struct block_device *bdev,
	loff_t		base)
{
	int		bit, hval;

	base >>= 9;
	base ^= (unsigned long)bdev / L1_CACHE_BYTES;
	for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
		hval ^= (int)base & (NHASH-1);
		base >>= NBITS;
	}
	return hval;
}

/*
 * Mapping of multi-page buffers into contiguous virtual space
 */

typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

STATIC a_list_t		*as_free_head;
STATIC int		as_list_len;
STATIC spinlock_t	as_lock = SPIN_LOCK_UNLOCKED;

/*
 * Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
	if (aentry) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}

/*
 * Internal pagebuf object manipulation
 */

STATIC void
_pagebuf_initialize(
	xfs_buf_t		*pb,
	xfs_buftarg_t		*target,
	loff_t			range_base,
	size_t			range_length,
	page_buf_flags_t	flags)
{
	/*
	 * We don't want certain flags to appear in pb->pb_flags.
	 */
	flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);

	memset(pb, 0, sizeof(xfs_buf_t));
	atomic_set(&pb->pb_hold, 1);
	init_MUTEX_LOCKED(&pb->pb_iodonesema);
	INIT_LIST_HEAD(&pb->pb_list);
	INIT_LIST_HEAD(&pb->pb_hash_list);
	init_MUTEX_LOCKED(&pb->pb_sema);	/* held, no waiters */
	PB_SET_OWNER(pb);
	pb->pb_target = target;
	pb->pb_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	pb->pb_buffer_length = pb->pb_count_desired = range_length;
	pb->pb_flags = flags | PBF_NONE;
	pb->pb_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&pb->pb_pin_count, 0);
	init_waitqueue_head(&pb->pb_waiters);

	XFS_STATS_INC(pb_create);
	PB_TRACE(pb, "initialize", target);
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_pagebuf_get_pages(
	xfs_buf_t		*pb,
	int			page_count,
	page_buf_flags_t	flags)
{
	/* Make sure that we have a page list */
	if (pb->pb_pages == NULL) {
		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
		pb->pb_page_count = page_count;
		if (page_count <= PB_PAGES) {
			pb->pb_pages = pb->pb_page_array;
		} else {
			pb->pb_pages = kmem_alloc(sizeof(struct page *) *
					page_count, pb_to_km(flags));
			if (pb->pb_pages == NULL)
				return -ENOMEM;
		}
		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}
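_pagebuf_get_pages() above avoids a heap allocation for small buffers: when the buffer needs no more than PB_PAGES pages it points pb_pages at the fixed pb_page_array embedded in the xfs_buf_t, and only larger buffers allocate a separate pointer array (freed again by _pagebuf_free_pages() below). A standalone userspace sketch of that pattern, with hypothetical demo_* names standing in for the pagebuf structures:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_INLINE_PAGES	4	/* stand-in for PB_PAGES */

struct demo_buf {
	void	**pages;				/* points at one of the two below */
	void	*page_array[DEMO_INLINE_PAGES];		/* inline array for small buffers */
	int	page_count;
};

/* Mirror _pagebuf_get_pages(): use the embedded array when it is big
 * enough, otherwise fall back to a heap-allocated pointer array. */
static int demo_get_pages(struct demo_buf *bp, int page_count)
{
	if (bp->pages == NULL) {
		bp->page_count = page_count;
		if (page_count <= DEMO_INLINE_PAGES) {
			bp->pages = bp->page_array;
		} else {
			bp->pages = malloc(sizeof(void *) * page_count);
			if (bp->pages == NULL)
				return -1;
		}
		memset(bp->pages, 0, sizeof(void *) * page_count);
	}
	return 0;
}

/* Mirror _pagebuf_free_pages(): only free what was heap allocated. */
static void demo_free_pages(struct demo_buf *bp)
{
	if (bp->pages != bp->page_array)
		free(bp->pages);
}

int main(void)
{
	struct demo_buf small = { 0 }, large = { 0 };

	demo_get_pages(&small, 2);	/* uses the inline array */
	demo_get_pages(&large, 16);	/* falls back to malloc */
	printf("small inline: %d, large inline: %d\n",
	       small.pages == small.page_array,
	       large.pages == large.page_array);
	demo_free_pages(&small);
	demo_free_pages(&large);
	return 0;
}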
/*
 * Frees pb_pages if it was malloced.
 */
STATIC void
_pagebuf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->pb_pages != bp->pb_page_array) {
		kmem_free(bp->pb_pages,
			  bp->pb_page_count * sizeof(struct page *));
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use pagebuf_rele instead for
 * hashed and refcounted buffers.
 */
void
pagebuf_free(
	xfs_buf_t		*bp)
{
	PB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->pb_hash_list));

	if (bp->pb_flags & _PBF_PAGE_CACHE) {
		uint		i;

		if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
			free_address(bp->pb_addr - bp->pb_offset);

		for (i = 0; i < bp->pb_page_count; i++)
			page_cache_release(bp->pb_pages[i]);
		_pagebuf_free_pages(bp);
	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
		 /*
		  * XXX(hch): bp->pb_count_desired might be incorrect (see
		  * pagebuf_associate_memory for details), but fortunately
		  * the Linux version of kmem_free ignores the len argument..
		  */
		kmem_free(bp->pb_addr, bp->pb_count_desired);
		_pagebuf_free_pages(bp);
	}

	pagebuf_deallocate(bp);
}

/*
 * Finds all pages for buffer in question and builds its page list.
 */
STATIC int
_pagebuf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->pb_target->pbr_mapping;
	unsigned int		sectorshift = bp->pb_target->pbr_sshift;
	size_t			blocksize = bp->pb_target->pbr_bsize;
	size_t			size = bp->pb_count_desired;
	size_t			nbytes, offset;
	int			gfp_mask = pb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	loff_t			end;
	int			error;

	end = bp->pb_file_offset + bp->pb_buffer_length;
	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);

	error = _pagebuf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->pb_flags |= _PBF_PAGE_CACHE;

	offset = bp->pb_offset;
	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->pb_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

	      retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & PBF_READ_AHEAD) {
				bp->pb_page_count = i;
				for (i = 0; i < bp->pb_page_count; i++)
					unlock_page(bp->pb_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"possible deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(pb_page_retries);
			pagebuf_daemon_wakeup(0, gfp_mask);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(10);
			goto retry;
		}

		XFS_STATS_INC(pb_page_found);

		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
		size -= nbytes;

		if (!PageUptodate(page)) {
			page_count--;
			if (blocksize == PAGE_CACHE_SIZE) {
				if (flags & PBF_READ)
					bp->pb_locked = 1;
			} else if (!PagePrivate(page)) {
				unsigned long	j, range;

				/*
				 * In this case page->private holds a bitmap
				 * of uptodate sectors within the page
				 */
				ASSERT(blocksize < PAGE_CACHE_SIZE);
				range = (offset + nbytes) >> sectorshift;
				for (j = offset >> sectorshift; j < range; j++)
					if (!test_bit(j, &page->private))
						break;
				if (j == range)
					page_count++;
			}
		}

		bp->pb_pages[i] = page;
		offset = 0;
	}

	if (!bp->pb_locked) {
		for (i = 0; i < bp->pb_page_count; i++)
			unlock_page(bp->pb_pages[i]);
	}

	if (page_count) {
		/* if we have any uptodate pages, mark that in the buffer */
		bp->pb_flags &= ~PBF_NONE;

		/* if some pages aren't uptodate, mark that in the buffer */
		if (page_count != bp->pb_page_count)
			bp->pb_flags |= PBF_PARTIAL;
	}

	PB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_pagebuf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->pb_page_count == 1) {
		bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	} else if (flags & PBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->pb_addr == NULL))
			return -ENOMEM;
		bp->pb_addr += bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	}

	return 0;
}
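When the filesystem block size is smaller than the page size, _pagebuf_lookup_pages() above treats page->private as a bitmap of uptodate sectors and counts a page as valid only if every sector covering the requested bytes is set. A minimal userspace sketch of that bitmap walk, assuming 512-byte sectors and using a plain unsigned long plus hypothetical demo_* names in place of page->private:

#include <stdio.h>
#include <stddef.h>

#define DEMO_SECTORSHIFT	9	/* 512-byte sectors, as pbr_sshift would be */

/*
 * Return 1 if every sector covering [offset, offset + nbytes) within a
 * page is marked uptodate in the bitmap, mirroring the test_bit() loop
 * in _pagebuf_lookup_pages().
 */
static int demo_range_uptodate(unsigned long bitmap, size_t offset,
			       size_t nbytes)
{
	unsigned long	j, range;

	range = (offset + nbytes) >> DEMO_SECTORSHIFT;
	for (j = offset >> DEMO_SECTORSHIFT; j < range; j++)
		if (!(bitmap & (1UL << j)))
			return 0;
	return 1;
}

int main(void)
{
	/* Sectors 0-3 uptodate (bits 0..3 set), sectors 4-7 not. */
	unsigned long bitmap = 0x0fUL;

	printf("%d\n", demo_range_uptodate(bitmap, 0, 2048));	/* 1: needs sectors 0-3 */
	printf("%d\n", demo_range_uptodate(bitmap, 0, 4096));	/* 0: needs sectors 0-7 */
	return 0;
}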
/*
 * Finding and Reading Buffers
 */

/*
 * _pagebuf_find
 *
 * Looks up, and creates if absent, a lockable buffer for
 * a given range of an inode. The buffer is returned
 * locked. If other overlapping buffers exist, they are
 * released before the new buffer is created and locked,
 * which may imply that this call will block until those buffers
 * are unlocked. No I/O is implied by this call.
 */
STATIC xfs_buf_t *
_pagebuf_find(				/* find buffer for block */
	xfs_buftarg_t		*target,/* target for block */
	loff_t			ioff,	/* starting offset of range */
	size_t			isize,	/* length of range */
	page_buf_flags_t	flags,	/* PBF_TRYLOCK */
	xfs_buf_t		*new_pb)/* newly allocated buffer */
{
	loff_t			range_base;
	size_t			range_length;
	int			hval;
	pb_hash_t		*h;
	xfs_buf_t		*pb, *n;
	int			not_locked;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Ensure we never do IOs smaller than the sector size */
	BUG_ON(range_length < (1 << target->pbr_sshift));

	/* Ensure we never do IOs that are not sector aligned */
	BUG_ON(range_base & (loff_t)target->pbr_smask);

	hval = _bhash(target->pbr_bdev, range_base);
	h = &pbhash[hval];

	spin_lock(&h->pb_hash_lock);
	list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
		if (pb->pb_target == target &&
		    pb->pb_file_offset == range_base &&
		    pb->pb_buffer_length == range_length) {
			/* If we look at something bring it to the
			 * front of the list for next time
			 */
			atomic_inc(&pb->pb_hold);
			list_move(&pb->pb_hash_list, &h->pb_hash);
			goto found;
		}
	}

	/* No match found */
	if (new_pb) {
		_pagebuf_initialize(new_pb, target, range_base,
				range_length, flags);
		new_pb->pb_hash_index = hval;
		list_add(&new_pb->pb_hash_list, &h->pb_hash);
	} else {
		XFS_STATS_INC(pb_miss_locked);
	}

	spin_unlock(&h->pb_hash_lock);
	return (new_pb);

found:
	spin_unlock(&h->pb_hash_lock);

	/* Attempt to get the semaphore without sleeping,
	 * if this does not work then we need to drop the
	 * spinlock and do a hard attempt on the semaphore.
	 */
	not_locked = down_trylock(&pb->pb_sema);
	if (not_locked) {
		if (!(flags & PBF_TRYLOCK)) {
			/* wait for buffer ownership */
			PB_TRACE(pb, "get_lock", 0);
			pagebuf_lock(pb);
			XFS_STATS_INC(pb_get_locked_waited);
		} else {
			/* We asked for a trylock and failed, no need
			 * to look at file offset and length here, we
			 * know that this pagebuf at least overlaps our
			 * pagebuf and is locked, therefore our buffer
			 * either does not exist, or is this buffer
			 */
			pagebuf_rele(pb);
			XFS_STATS_INC(pb_busy_locked);
			return (NULL);
		}
	} else {
		/* trylock worked */
		PB_SET_OWNER(pb);
	}

	if (pb->pb_flags & PBF_STALE)
		pb->pb_flags &= PBF_MAPPED;
	PB_TRACE(pb, "got_lock", 0);
	XFS_STATS_INC(pb_get_locked);
	return (pb);
}

/*
 * pagebuf_find
 *
 * pagebuf_find returns a buffer matching the specified range of
 * data for the specified target, if any of the relevant blocks
 * are in memory. The buffer may have unallocated holes, if
 * some, but not all, of the blocks are in memory. Even where
 * pages are present in the buffer, not all of every page may be
 * valid.
 */
xfs_buf_t *
pagebuf_find(				/* find buffer for block */
					/* if the block is in memory */
	xfs_buftarg_t		*target,/* target for block */
	loff_t			ioff,	/* starting offset of range */
	size_t			isize,	/* length of range */
	page_buf_flags_t	flags)	/* PBF_TRYLOCK */
{
	return _pagebuf_find(target, ioff, isize, flags, NULL);
}

/*
 * pagebuf_get
 *
 * pagebuf_get assembles a buffer covering the specified range.
 * Some or all of the blocks in the range may be valid. Storage
 * in memory for all portions of the buffer will be allocated,
 * although backing storage may not be. If PBF_READ is set in
 * flags, pagebuf_iostart is called also.
 */
xfs_buf_t *
pagebuf_get(				/* allocate a buffer */