/*
 * zbud.c - Compression buddies allocator
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * Compression buddies ("zbud") provides for efficiently packing two
 * (or, possibly in the future, more) compressed pages ("zpages") into
 * a single "raw" pageframe and for tracking both zpages and pageframes
 * so that whole pageframes can be easily reclaimed in LRU-like order.
 * It is designed to be used in conjunction with transcendent memory
 * ("tmem"); for example separate LRU lists are maintained for persistent
 * vs. ephemeral pages.
 *
 * A zbudpage is an overlay for a struct page and thus each zbudpage
 * refers to a physical pageframe of RAM.  When the caller passes a
 * struct page from the kernel's page allocator, zbud "transforms" it
 * to a zbudpage which sets/uses a different set of fields than the
 * struct-page and thus must "untransform" it back by reinitializing
 * certain fields before the struct-page can be freed.  The fields
 * of a zbudpage include a page lock for controlling access to the
 * corresponding pageframe, and there is a size field for each zpage.
 * Each zbudpage also lives on two linked lists: a "budlist" which is
 * used to support efficient buddying of zpages; and an "lru" which
 * is used for reclaiming pageframes in approximately least-recently-used
 * order.
 *
 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
 * which contain the compressed data for zero, one, or two zbuds.  Contained
 * within the compressed data is a tmem_handle which is a key to allow
 * the same data to be found via the tmem interface so the zpage can
 * be invalidated (for ephemeral pages) or repatriated to the swap cache
 * (for persistent pages).  The contents of a zbudpageframe must never
 * be accessed without holding the page lock for the corresponding
 * zbudpage and, to accommodate highmem machines, the contents may
 * only be examined or changed while kmapped.  Thus, when in use, a
 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
 *
 * Note that the term "zbud" refers to the combination of a zpage and
 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
 * it also generically refers to this allocator... sorry for any confusion.
 *
 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
 * struct page), with the LSB either cleared or set to indicate, respectively,
 * the first or second zpage in the zbudpageframe. Since a zbudref can be
 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
 * references a stored tmem page and so is the only zbud data structure
 * externally visible to zbud.c/zbud.h.
 *
 * Since we wish to reclaim entire pageframes but zpages may be randomly
 * added to and deleted from any given pageframe, we approximate LRU by
 * promoting a pageframe to MRU when a zpage is added to it, but
 * leaving it at the current place in the list when a zpage is deleted
 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
 * very large zpages) will be reclaimed faster than average, which seems
 * reasonable.
 *
 * In the current implementation, no more than two zpages may be stored in
 * any pageframe and no zpage ever crosses a pageframe boundary.  While
 * other zpage allocation mechanisms may allow greater density, this two
 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
 * (including garbage collection of references to the contents of those
 * pageframes from tmem data structures) AND avoids the need for compaction.
 * With additional complexity, zbud could be modified to support storing
 * up to three zpages per pageframe or, to handle larger average zpages,
 * up to three zpages per pair of pageframes, but it is not clear if the
 * additional complexity would be worth it.  So consider it an exercise
 * for future developers.
 *
 * Note also that zbud does no page allocation or freeing.  This is so
 * that the caller has complete control over, and accounting visibility
 * into, if/when pages are allocated and freed.
 *
 * Finally, note that zbud limits the size of zpages it can store; the
 * caller must check the zpage size with zbud_max_buddy_size before
 * storing it, else BUGs will result.  User beware.
 */

#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"

/*
 * We need to ensure that a struct zbudpage is never larger than a
 * struct page.  This is checked with a BUG_ON in zbud_init.
 *
 * The unevictable field indicates that a zbud is being added to the
 * zbudpage.  Since this is a two-phase process (due to tmem locking),
 * this field locks the zbudpage against eviction when a zbud match
 * or creation is in process.  Since this addition process may occur
 * in parallel for two zbuds in one zbudpage, the field is a counter
 * that must not exceed two.
 */
struct zbudpage {
	union {
		struct page page;
		struct {
			unsigned long space_for_flags;
			struct {
				unsigned zbud0_size:PAGE_SHIFT;
				unsigned zbud1_size:PAGE_SHIFT;
				unsigned unevictable:2;
			};
			struct list_head budlist;
			struct list_head lru;
		};
	};
};
#if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
#error "zbud won't work for this arch, PAGE_SIZE is too large"
#endif
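
/*
 * Worked example of the check above: the three bitfields consume
 * (PAGE_SHIFT * 2) + 2 bits and must fit in a single unsigned long.
 * With 4KB pages (PAGE_SHIFT == 12) that is 26 bits, which fits on
 * both 32-bit and 64-bit architectures; with 64KB pages
 * (PAGE_SHIFT == 16) it is 34 bits, which the #error rejects on a
 * 32-bit architecture.
 */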

struct zbudref {
	union {
		struct zbudpage *zbudpage;
		unsigned long zbudref;
	};
};

#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)
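
/*
 * For example, with 4KB pages (PAGE_SHIFT == 12): CHUNK_SIZE is 64,
 * NCHUNKS is 64 and MAX_CHUNK is 63, so the largest zbud that fits
 * (see zbud_max_size() below) is 63 * 64 = 4032 bytes.
 */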

/*
 * The following functions deal with the difference between struct
 * page and struct zbudpage.  Note the hack of using the pageflags
 * from struct page; this is to avoid duplicating all the complex
 * pageflag macros.
 */
static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
		do {
			cpu_relax();
		} while (test_bit(PG_locked, &page->flags));
	}
}

static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_locked, &page->flags);
}

static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
	return trylock_page((struct page *)zbudpage);
}

static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
	return PageLocked((struct page *)zbudpage);
}

static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
	return kmap_atomic((struct page *)zbudpage);
}

/*
 * A dying zbudpage is an ephemeral page in the process of being evicted.
 * Any data contained in the zbudpage is invalid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return test_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	set_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_reclaim, &page->flags);
}

/*
 * A zombie zbudpage is a persistent page in the process of being evicted.
 * The data contained in the zbudpage is valid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return test_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	set_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	clear_bit(PG_dirty, &page->flags);
}

static inline void kunmap_zbudpage_atomic(void *zbpg)
{
	kunmap_atomic(zbpg);
}

/*
 * zbud "translation" and helper functions
 */

static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
	unsigned long zbud = (unsigned long)zref;
	zbud &= ~1UL;
	return (struct zbudpage *)zbud;
}

static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
							unsigned budnum)
{
	unsigned long zbud = (unsigned long)zbudpage;
	BUG_ON(budnum > 1);
	zbud |= budnum;
	return (struct zbudref *)zbud;
}

static inline int zbudref_budnum(struct zbudref *zbudref)
{
	unsigned long zbud = (unsigned long)zbudref;
	return zbud & 1UL;
}
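
/*
 * Example of the encoding round-trip: a struct zbudpage overlays a
 * struct page, so its address is always at least word-aligned and the
 * LSB is free to carry the buddy number:
 *
 *	struct zbudref *zref = zbudpage_to_zbudref(zbudpage, 1);
 *
 *	zbudref_to_zbudpage(zref) == zbudpage   (LSB masked off)
 *	zbudref_budnum(zref) == 1
 */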

static inline unsigned zbud_max_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
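
/*
 * For example, a 1000-byte zbud occupies (1000 + 63) >> 6 == 16 chunks,
 * i.e. 1024 bytes of the pageframe once rounded up to CHUNK_SIZE.
 */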

/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
static inline char *zbud_data(void *zbpg,
			unsigned budnum, unsigned size)
{
	char *p;

	BUG_ON(size == 0 || size > zbud_max_size());
	p = (char *)zbpg;
	if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}
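
/*
 * Layout example with 4KB pages: zbud 0 always starts at offset 0,
 * while zbud 1 is placed so that it ends at PAGE_SIZE, i.e. it starts
 * at PAGE_SIZE minus its chunk-rounded size.  A 1000-byte zbud 1 thus
 * starts at 4096 - 1024 = 3072, leaving the chunks in the middle free
 * for a future buddy.
 */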

/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 */
static ssize_t zbud_eph_pageframes;
static ssize_t zbud_pers_pageframes;
static ssize_t zbud_eph_zpages;
static ssize_t zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static ssize_t zbud_eph_evicted_pageframes;
static ssize_t zbud_pers_evicted_pageframes;
static ssize_t zbud_eph_cumul_zpages;
static ssize_t zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
static ssize_t zbud_eph_buddied_count;
static ssize_t zbud_pers_buddied_count;
static ssize_t zbud_eph_unbuddied_count;
static ssize_t zbud_pers_unbuddied_count;
static ssize_t zbud_eph_zombie_count;
static ssize_t zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define	zdfs	debugfs_create_size_t
#define	zdfs64	debugfs_create_u64
static int zbud_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("zbud", NULL);
	if (root == NULL)
		return -ENXIO;

	/*
	 * would be nice to dump the sizes of the unbuddied
	 * arrays, like was done with sysfs, but it doesn't
	 * look like debugfs is flexible enough to do that
	 */
	zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
	zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
	zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
	zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
	zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
	zdfs("eph_evicted_pageframes", S_IRUGO, root,
				&zbud_eph_evicted_pageframes);
	zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
	zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
	zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
	zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
	zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
	zdfs("pers_evicted_pageframes", S_IRUGO, root,
				&zbud_pers_evicted_pageframes);
	zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
	zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
	zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
	zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
	zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
	return 0;
}
#undef	zdfs
#undef	zdfs64
#else
static inline int zbud_debugfs_init(void)
{
	return 0;
}
#endif
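
/*
 * With debugfs mounted at its usual /sys/kernel/debug, the counters
 * registered above can be read directly, e.g.:
 *
 *	cat /sys/kernel/debug/zbud/eph_pageframes
 *	cat /sys/kernel/debug/zbud/eph_zbytes
 */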

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);

struct zbud_unbuddied {
	struct list_head list;
	unsigned count;
};

/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);

/*
 * Given a struct page, transform it to a zbudpage so that it can be
 * used by zbud and initialize fields as necessary.
 */
static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
{
	struct zbudpage *zbudpage = (struct zbudpage *)page;

	BUG_ON(page == NULL);
	INIT_LIST_HEAD(&zbudpage->budlist);
	INIT_LIST_HEAD(&zbudpage->lru);
	zbudpage->zbud0_size = 0;
	zbudpage->zbud1_size = 0;
	zbudpage->unevictable = 0;
	if (eph)
		zbud_eph_pageframes++;
	else
		zbud_pers_pageframes++;
	return zbudpage;
}

/* "Transform" a zbudpage back to a struct page suitable to free. */
static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
								bool eph)
{
	struct page *page = (struct page *)zbudpage;

	BUG_ON(!list_empty(&zbudpage->budlist));
	BUG_ON(!list_empty(&zbudpage->lru));
	BUG_ON(zbudpage->zbud0_size != 0);
	BUG_ON(zbudpage->zbud1_size != 0);
	BUG_ON(!PageLocked(page));
	BUG_ON(zbudpage->unevictable != 0);
	BUG_ON(zbudpage_is_dying(zbudpage));
	BUG_ON(zbudpage_is_zombie(zbudpage));
	if (eph)
		zbud_eph_pageframes--;
	else
		zbud_pers_pageframes--;
	zbudpage_spin_unlock(zbudpage);
	page_mapcount_reset(page);
	init_page_count(page);
	page->index = 0;
	return page;
}

/* Mark a zbud as unused and do accounting */
static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
					int budnum, bool eph)
{
	unsigned size;

	BUG_ON(!zbudpage_is_locked(zbudpage));
	if (budnum == 0) {
		size = zbudpage->zbud0_size;
		zbudpage->zbud0_size = 0;
	} else {
		size = zbudpage->zbud1_size;
		zbudpage->zbud1_size = 0;
	}
	if (eph) {
		zbud_eph_zbytes -= size;
		zbud_eph_zpages--;
	} else {
		zbud_pers_zbytes -= size;
		zbud_pers_zpages--;
	}
}

/*
 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 * to some data, set up the zbud appropriately including data copying
 * and accounting.  Note that if cdata is NULL, the data copying is
 * skipped.  (This is useful for lazy writes such as for RAMster.)
 */
static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
				bool eph, void *cdata,
				unsigned budnum, unsigned size)
{
	char *to;
	void *zbpg;
	struct tmem_handle *to_th;
	unsigned nchunks = zbud_size_to_chunks(size);

	BUG_ON(!zbudpage_is_locked(zbudpage));
	zbpg = kmap_zbudpage_atomic(zbudpage);
	to = zbud_data(zbpg, budnum, size);
	to_th = (struct tmem_handle *)to;
	to_th->index = th->index;
	to_th->oid = th->oid;
	to_th->pool_id = th->pool_id;
	to_th->client_id = th->client_id;
	to += sizeof(struct tmem_handle);
	if (cdata != NULL)
		memcpy(to, cdata, size - sizeof(struct tmem_handle));
	kunmap_zbudpage_atomic(zbpg);
	if (budnum == 0)
		zbudpage->zbud0_size = size;
	else
		zbudpage->zbud1_size = size;
	if (eph) {
		zbud_eph_cumul_chunk_counts[nchunks]++;
		zbud_eph_zpages++;
		zbud_eph_cumul_zpages++;
		zbud_eph_zbytes += size;
		zbud_eph_cumul_zbytes += size;
	} else {
		zbud_pers_cumul_chunk_counts[nchunks]++;
		zbud_pers_zpages++;
		zbud_pers_cumul_zpages++;
		zbud_pers_zbytes += size;
		zbud_pers_cumul_zbytes += size;
	}
}
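
/*
 * The in-page layout zbud_init_zbud() produces for each zbud is a
 * tmem_handle immediately followed by the compressed payload:
 *
 *	| tmem_handle (key) | size - sizeof(tmem_handle) bytes of data |
 *
 * which is why every reader below advances the data pointer by
 * sizeof(struct tmem_handle) before touching the payload.
 */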

/*
 * Given a locked dying zbudpage, read out the tmem handles from the data,
 * unlock the page, then use the handles to tell tmem to flush out its
 * references
 */
static void zbud_evict_tmem(struct zbudpage *zbudpage)
{
	int i, j;
	uint32_t pool_id[2], client_id[2];
	uint32_t index[2];
	struct tmem_oid oid[2];
	struct tmem_pool *pool;
	void *zbpg;
	struct tmem_handle *th;
	unsigned size;

	/* read out the tmem handles from the data and set aside */
	zbpg = kmap_zbudpage_atomic(zbudpage);
	for (i = 0, j = 0; i < 2; i++) {
		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
		if (size) {
			th = (struct tmem_handle *)zbud_data(zbpg, i, size);
			client_id[j] = th->client_id;
			pool_id[j] = th->pool_id;
			oid[j] = th->oid;
			index[j] = th->index;
			j++;
			zbud_unuse_zbud(zbudpage, i, true);
		}
	}
	kunmap_zbudpage_atomic(zbpg);
	zbudpage_spin_unlock(zbudpage);
	/* zbudpage is now an unlocked dying... tell tmem to flush pointers */
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		if (pool != NULL) {
			tmem_flush_page(pool, &oid[i], index[i]);
			zcache_put_pool(pool);
		}
	}
}

/*
 * Externally callable zbud handling routines.
 */

/*
 * Return the maximum size compressed page that can be stored (secretly
 * setting aside space for the tmem handle).
 */
unsigned int zbud_max_buddy_size(void)
{
	return zbud_max_size() - sizeof(struct tmem_handle);
}
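
/*
 * Continuing the 4KB-page example: zbud_max_size() is 4032 bytes, so
 * the largest zpage a caller may store is 4032 minus the size of
 * struct tmem_handle (the exact figure depends on that struct's
 * layout in tmem.h).
 */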

/*
 * Given a zbud reference, free the corresponding zbud from all lists,
 * mark it as unused, do accounting, and if the freeing of the zbud
 * frees up an entire pageframe, return it to the caller (else NULL).
 */
struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
				  unsigned int *zsize, unsigned int *zpages)
{
	unsigned long budnum = zbudref_budnum(zref);
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	struct page *page = NULL;
	unsigned chunks, bud_size, other_bud_size;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		zbudpage_spin_unlock(zbudpage);
		spin_unlock(lists_lock);
		*zpages = 0;
		*zsize = 0;
		goto out;
	}
	if (budnum == 0) {
		bud_size = zbudpage->zbud0_size;
		other_bud_size = zbudpage->zbud1_size;
	} else {
		bud_size = zbudpage->zbud1_size;
		other_bud_size = zbudpage->zbud0_size;
	}
	*zsize = bud_size - sizeof(struct tmem_handle);
	*zpages = 1;
	zbud_unuse_zbud(zbudpage, budnum, eph);
	if (other_bud_size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(bud_size);
		if (zbudpage_is_zombie(zbudpage)) {
			if (eph)
				zbud_eph_zombie_count =
				  atomic_dec_return(&zbud_eph_zombie_atomic);
			else
				zbud_pers_zombie_count =
				  atomic_dec_return(&zbud_pers_zombie_atomic);
			zbudpage_clear_zombie(zbudpage);
		} else {
			BUG_ON(list_empty(&unbud[chunks].list));
			list_del_init(&zbudpage->budlist);
			unbud[chunks].count--;
		}
		list_del_init(&zbudpage->lru);
		spin_unlock(lists_lock);
		if (eph)
			zbud_eph_unbuddied_count--;
		else
			zbud_pers_unbuddied_count--;
		page = zbud_unuse_zbudpage(zbudpage, eph);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(other_bud_size);
		if (!zbudpage_is_zombie(zbudpage)) {
			list_del_init(&zbudpage->budlist);
			list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
			unbud[chunks].count++;
		}
		if (eph) {
			zbud_eph_buddied_count--;
			zbud_eph_unbuddied_count++;
		} else {
			zbud_pers_unbuddied_count++;
			zbud_pers_buddied_count--;
		}
		/* don't mess with lru, no need to move it */
		zbudpage_spin_unlock(zbudpage);
		spin_unlock(lists_lock);
	}
out:
	return page;
}
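
/*
 * Illustrative caller-side pattern (a sketch, not taken from the real
 * caller): since zbud never allocates or frees pages itself, the
 * caller must free any pageframe handed back:
 *
 *	unsigned int zsize, zpages;
 *	struct page *page;
 *
 *	page = zbud_free_and_delist(zref, eph, &zsize, &zpages);
 *	if (page != NULL)
 *		__free_page(page);
 *
 * A non-NULL return means the last zbud in the frame was freed.
 */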

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, try to find an unbuddied zbudpage in which to
 * create a zbud. If found, put it there, mark the zbudpage unevictable,
 * and return a zbudref to it.  Else return NULL.
 */
struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
				void *cdata, unsigned size)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	unsigned long budnum = 0UL;
	unsigned nchunks;
	int i, found_good_buddy = 0;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

	size += sizeof(struct tmem_handle);
	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(lists_lock);
		if (!list_empty(&unbud[i].list)) {
			list_for_each_entry_safe(zbudpage, zbudpage2,
				    &unbud[i].list, budlist) {
				if (zbudpage_spin_trylock(zbudpage)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(lists_lock);
	}
	zbudpage = NULL;
	goto out;

found_unbuddied:
	BUG_ON(!zbudpage_is_locked(zbudpage));
	BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
	if (zbudpage->zbud0_size == 0)
		budnum = 0UL;
	else if (zbudpage->zbud1_size == 0)
		budnum = 1UL;
	list_del_init(&zbudpage->budlist);
	if (eph) {
		list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
		unbud[found_good_buddy].count--;
		zbud_eph_unbuddied_count--;
		zbud_eph_buddied_count++;
		/* "promote" raw zbudpage to most-recently-used */
		list_del_init(&zbudpage->lru);
		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
	} else {
		list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
		unbud[found_good_buddy].count--;
		zbud_pers_unbuddied_count--;
		zbud_pers_buddied_count++;
		/* "promote" raw zbudpage to most-recently-used */
		list_del_init(&zbudpage->lru);
		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
	}
	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
	zbudpage->unevictable++;
	BUG_ON(zbudpage->unevictable == 3);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
out:
	return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, and a newly allocated struct page, create an unevictable
 * zbud in that new page and return a zbudref to it.
 */
struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
					void *cdata, unsigned size,
					struct page *newpage)
{
	struct zbudpage *zbudpage;
	unsigned long budnum = 0;
	unsigned nchunks;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct zbud_unbuddied *unbud =
		eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

#if 0
	/* this may be worth it later to support decompress-in-place? */
	static unsigned long counter;
	budnum = counter++ & 1;	/* alternate using zbud0 and zbud1 */
#endif

	if (size > zbud_max_buddy_size())
		return NULL;
	if (newpage == NULL)
		return NULL;

	size += sizeof(struct tmem_handle);
	nchunks = zbud_size_to_chunks(size);
	spin_lock(lists_lock);
	zbudpage = zbud_init_zbudpage(newpage, eph);
	zbudpage_spin_lock(zbudpage);
	list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
	if (eph) {
		list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
		zbud_eph_unbuddied_count++;
	} else {
		list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
		zbud_pers_unbuddied_count++;
	}
	unbud[nchunks].count++;
	zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
	zbudpage->unevictable++;
	BUG_ON(zbudpage->unevictable == 3);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Finish creation of a zbud by decrementing the unevictable count,
 * making the zbudpage evictable again unless another zbud is still
 * being created in it in parallel.
 */
void zbud_create_finish(struct zbudref *zref, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	BUG_ON(zbudpage_is_dying(zbudpage));
	zbudpage->unevictable--;
	BUG_ON((int)zbudpage->unevictable < 0);
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
}
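
/*
 * Illustrative sketch of the two-phase store protocol as a caller
 * (e.g. zcache) might drive it; error handling is elided and the
 * compressed buffer cdata/clen is assumed to be already prepared:
 *
 *	struct zbudref *zref;
 *	struct page *newpage;
 *
 *	BUG_ON(clen > zbud_max_buddy_size());
 *	zref = zbud_match_prep(th, eph, cdata, clen);
 *	if (zref == NULL) {
 *		newpage = alloc_page(...);	(caller owns allocation)
 *		zref = zbud_create_prep(th, eph, cdata, clen, newpage);
 *	}
 *	... record zref as the tmem pampd ...
 *	zbud_create_finish(zref, eph);
 *
 * Both prep calls leave the zbudpage marked unevictable;
 * zbud_create_finish() is what makes it evictable again.
 */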

/*
 * Given a zbudref and a struct page, decompress the data from
 * the zbud into the physical page represented by the struct page
 * by upcalling to zcache_decompress
 */
int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
			void (*decompress)(char *, unsigned int, char *))
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *to_va, *from_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	to_va = kmap_atomic(data_page);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	from_va = zbud_data(zbpg, budnum, size);
	from_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	decompress(from_va, size, to_va);
	kunmap_atomic(to_va);
	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the zbud to the kernel pointer.
 */
int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
				size_t *sizep, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *from_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	from_va = zbud_data(zbpg, budnum, size);
	from_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	*sizep = size;
	memcpy(to_va, from_va, size);

	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the kernel pointer to the zbud.
 */
int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
{
	struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
	unsigned long budnum = zbudref_budnum(zref);
	void *zbpg;
	char *to_va;
	unsigned size;
	int ret = -1;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

	spin_lock(lists_lock);
	zbudpage_spin_lock(zbudpage);
	if (zbudpage_is_dying(zbudpage)) {
		/* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
		goto out;
	}
	zbpg = kmap_zbudpage_atomic(zbudpage);
	if (budnum == 0)
		size = zbudpage->zbud0_size;
	else
		size = zbudpage->zbud1_size;
	BUG_ON(size == 0 || size > zbud_max_size());
	to_va = zbud_data(zbpg, budnum, size);
	to_va += sizeof(struct tmem_handle);
	size -= sizeof(struct tmem_handle);
	memcpy(to_va, from_va, size);

	kunmap_zbudpage_atomic(zbpg);
	ret = 0;
out:
	zbudpage_spin_unlock(zbudpage);
	spin_unlock(lists_lock);
	return ret;
}

/*
 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 * there are no references to it remaining, and return the now unused
 * (and re-init'ed) struct page and the total amount of compressed
 * data that was evicted.
 */
struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
	struct page *page = NULL;
	bool irqs_disabled = irqs_disabled();

	/*
	 * Since this can be called indirectly from cleancache_put, which
	 * has interrupts disabled, as well as frontswap_put, which does not,
	 * we need to be able to handle both cases, even though it is ugly.
	 */
	if (irqs_disabled)
		spin_lock(&zbud_eph_lists_lock);
	else
		spin_lock_bh(&zbud_eph_lists_lock);
	*zsize = 0;
	if (list_empty(&zbud_eph_lru_list))
		goto unlock_out;
	list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
		/* skip a locked zbudpage */
		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
			continue;
		/* skip an unevictable zbudpage */
		if (unlikely(zbudpage->unevictable != 0)) {
			zbudpage_spin_unlock(zbudpage);
			continue;
		}
		/* got a locked evictable page */
		goto evict_page;

	}
unlock_out:
	/* no unlocked evictable pages, give up */
	if (irqs_disabled)
		spin_unlock(&zbud_eph_lists_lock);
	else
		spin_unlock_bh(&zbud_eph_lists_lock);
	goto out;

evict_page:
	list_del_init(&zbudpage->budlist);
	list_del_init(&zbudpage->lru);
	zbudpage_set_dying(zbudpage);
	/*
	 * the zbudpage is now "dying" and attempts to read, write,
	 * or delete data from it will be ignored
	 */
	if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
		*zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
				(2 * sizeof(struct tmem_handle));
		*zpages = 2;
	} else if (zbudpage->zbud0_size != 0) {
		unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
		*zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
		*zpages = 1;
	} else if (zbudpage->zbud1_size != 0) {
		unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
		*zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
		*zpages = 1;
	} else {
		BUG();
	}
	spin_unlock(&zbud_eph_lists_lock);
	zbud_eph_evicted_pageframes++;
	if (*zpages == 1)
		zbud_eph_unbuddied_count--;
	else
		zbud_eph_buddied_count--;
	zbud_evict_tmem(zbudpage);
	zbudpage_spin_lock(zbudpage);
	zbudpage_clear_dying(zbudpage);
	page = zbud_unuse_zbudpage(zbudpage, true);
	if (!irqs_disabled)
		local_bh_enable();
out:
	return page;
}
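
/*
 * Illustrative reclaim loop a caller might run; need_more_memory() is
 * a hypothetical placeholder for the caller's own policy:
 *
 *	unsigned int zsize, zpages;
 *	struct page *page;
 *
 *	while (need_more_memory()) {
 *		page = zbud_evict_pageframe_lru(&zsize, &zpages);
 *		if (page == NULL)
 *			break;
 *		__free_page(page);
 *	}
 *
 * A NULL return means no unlocked evictable pageframe was found.
 */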

/*
 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 * read the tmem_handle(s) out of it into the passed array, and return the
 * number of zbuds.  Caller must perform necessary tmem functions and,
 * indirectly, zbud functions to fetch any valid data and cause the
 * now-zombified zbudpage to eventually be freed.  We track the zombified
 * zbudpage count so it is possible to observe if there is a leak.
 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
 */
unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
					unsigned int *zsize, bool eph)
{
	struct zbudpage *zbudpage = NULL, *zbudpage2;
	struct tmem_handle *thfrom;
	char *from_va;
	void *zbpg;
	unsigned size;
	int ret = 0, i;
	spinlock_t *lists_lock =
		eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
	struct list_head *lru_list =
		eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;

	spin_lock_bh(lists_lock);
	if (list_empty(lru_list))
		goto out;
	list_for_each_entry_safe(zbudpage, zbudpage2, lru_list, lru) {
		/* skip a locked zbudpage */
		if (unlikely(!zbudpage_spin_trylock(zbudpage)))
			continue;
		/* skip an unevictable zbudpage */
		if (unlikely(zbudpage->unevictable != 0)) {
			zbudpage_spin_unlock(zbudpage);
			continue;
		}
		/* got a locked evictable page */
		goto zombify_page;
	}
	/* no unlocked evictable pages, give up */
	goto out;

zombify_page:
	/* got an unlocked evictable page, zombify it */
	list_del_init(&zbudpage->budlist);
	zbudpage_set_zombie(zbudpage);
	/* FIXME what accounting do I need to do here? */
	list_del_init(&zbudpage->lru);
	if (eph) {
		list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
		zbud_eph_zombie_count =
				atomic_inc_return(&zbud_eph_zombie_atomic);
	} else {
		list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
		zbud_pers_zombie_count =
				atomic_inc_return(&zbud_pers_zombie_atomic);
	}
	/* FIXME what accounting do I need to do here? */
	zbpg = kmap_zbudpage_atomic(zbudpage);
	for (i = 0; i < 2; i++) {
		size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
		if (size) {
			from_va = zbud_data(zbpg, i, size);
			thfrom = (struct tmem_handle *)from_va;
			from_va += sizeof(struct tmem_handle);
			size -= sizeof(struct tmem_handle);
			if (th != NULL)
				th[ret] = *thfrom;
			if (data != NULL)
				memcpy(data[ret], from_va, size);
			if (zsize != NULL)
				*zsize++ = size;
			ret++;
		}
	}
	kunmap_zbudpage_atomic(zbpg);
	zbudpage_spin_unlock(zbudpage);
out:
	spin_unlock_bh(lists_lock);
	return ret;
}
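
/*
 * Illustrative use (hypothetical local buffers; see the FIXME above
 * for the ramster case where data pointers are also passed in):
 *
 *	struct tmem_handle th[2];
 *	unsigned int zsize[2];
 *	unsigned int n, i;
 *
 *	n = zbud_make_zombie_lru(th, NULL, zsize, false);
 *	for (i = 0; i < n; i++)
 *		... use th[i] to repatriate the data via tmem, which
 *		    eventually frees the zombified zbudpage ...
 */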

void zbud_init(void)
{
	int i;

	zbud_debugfs_init();
	BUG_ON(sizeof(struct tmem_handle) * 2 > CHUNK_SIZE);
	BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
	for (i = 0; i < NCHUNKS; i++) {
		INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
		INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
	}
}