XV6学习（12）Lab lock: Parallelism/locking

代码在github上

这一次实验是要对XV6内部的锁进行优化，减少锁争用，提高系统的性能。

Memory allocator (moderate)

第一个实验是对XV6内核的内存页面分配器进行改进，改进的策略在前面的章节中也讲过了。XV6原本是使用一个空闲页面链表，但是这样就会导致不同CPU上的kalloc和kfree会产生锁争用，内存页面的分配被完全串行化了，降低了系统的性能。

而一个改进策略就是为每个CPU核心分配一个空闲链表，kalloc和kfree都在本核心的链表上进行，只有当当前核心的链表为空时才去访问其他核心的链表。通过这种策略就可以减少锁的争用，只有当某核心的链表为空时才会发生锁争用。

首先定义NCPU个kmem结构体，并在kinit函数中对锁进行初始化。

struct {

  struct spinlock lock;

  struct run *freelist;

  char lock_name[7];

} kmem[NCPU];

void

kinit()

{

  for (int i = 0; i < NCPU; i++) {

    snprintf(kmem[i].lock_name, sizeof(kmem[i].lock_name), "kmem_%d", i);

    initlock(&kmem[i].lock, kmem[i].lock_name);

  }

  freerange(end, (void*)PHYSTOP);

}

对于kfree函数只需要将释放的页面插入到当前核心对应链表上就行了

void

kfree(void *pa)

{

  ...

  r = (struct run*)pa;

  push_off();

  int id = cpuid();

  acquire(&kmem[id].lock);

  r->next = kmem[id].freelist;

  kmem[id].freelist = r;

  release(&kmem[id].lock);

  pop_off();

}

对于kalloc函数，当在当前核心上申请失败时，就尝试从其他核心上获取页面。使用快慢指针来找到链表的中点，之后将一半的页面移动到当前核心的链表上。

void *

kalloc(void)

{

  struct run *r;

  push_off();

  int id = cpuid();

  acquire(&kmem[id].lock);

  r = kmem[id].freelist;

  if(r) {

    kmem[id].freelist = r->next;

  }

  else {

    // alloc failed, try to steal from other cpu

    int success = 0;

    int i = 0;

    for(i = 0; i < NCPU; i++) {

      if (i == id) continue;

      acquire(&kmem[i].lock);

      struct run *p = kmem[i].freelist;

      if(p) {

        // steal half of memory

        struct run *fp = p; // faster pointer

        struct run *pre = p;

        while (fp && fp->next) {

          fp = fp->next->next;

          pre = p;

          p = p->next;

        }

        kmem[id].freelist = kmem[i].freelist;

        if (p == kmem[i].freelist) {

          // only have one page

          kmem[i].freelist = 0;

        }

        else {

          kmem[i].freelist = p;

          pre->next = 0;

        }

        success = 1;

      }

      release(&kmem[i].lock);

      if (success) {

        r = kmem[id].freelist;

        kmem[id].freelist = r->next;

        break;

      }

    }

  }

  release(&kmem[id].lock);

  pop_off();

  if(r)

    memset((char*)r, 5, PGSIZE); // fill with junk

  return (void*)r;

}

实验结果如下：

$ kalloctest

start test1

test1 results:

--- lock kmem/bcache stats

lock: kmem_0: #fetch-and-add 0 #acquire() 77186

lock: kmem_1: #fetch-and-add 0 #acquire() 182362

lock: kmem_2: #fetch-and-add 0 #acquire() 173534

lock: bcache_bucket: #fetch-and-add 0 #acquire() 128

lock: bcache_bucket: #fetch-and-add 0 #acquire() 138

lock: bcache_bucket: #fetch-and-add 0 #acquire() 142

lock: bcache_bucket: #fetch-and-add 0 #acquire() 148

lock: bcache_bucket: #fetch-and-add 0 #acquire() 132

lock: bcache_bucket: #fetch-and-add 0 #acquire() 6

lock: bcache_bucket: #fetch-and-add 0 #acquire() 42

lock: bcache_bucket: #fetch-and-add 0 #acquire() 34

lock: bcache_bucket: #fetch-and-add 0 #acquire() 5916

lock: bcache_bucket: #fetch-and-add 0 #acquire() 32

lock: bcache_bucket: #fetch-and-add 0 #acquire() 242

lock: bcache_bucket: #fetch-and-add 0 #acquire() 128

lock: bcache_bucket: #fetch-and-add 0 #acquire() 128

--- top 5 contended locks:

lock: proc: #fetch-and-add 31954 #acquire() 206502

lock: proc: #fetch-and-add 24395 #acquire() 206518

lock: proc: #fetch-and-add 9306 #acquire() 206501

lock: proc: #fetch-and-add 7463 #acquire() 206481

lock: proc: #fetch-and-add 5209 #acquire() 206480

tot= 0

test1 OK

start test2

total free number of pages: 32493 (out of 32768)

.....

test2 OK

Buffer cache (hard)

这个实验是要对XV6的磁盘缓冲区进行优化。在初始的XV6磁盘缓冲区中是使用一个LRU链表来维护的，而这就导致了每次获取、释放缓冲区时就要对整个链表加锁，也就是说缓冲区的操作是完全串行进行的。

为了提高并行性能，我们可以用哈希表来代替链表，这样每次获取和释放的时候，都只需要对哈希表的一个桶进行加锁，桶之间的操作就可以并行进行。只有当需要对缓冲区进行驱逐替换时，才需要对整个哈希表加锁来查找要替换的块。

使用哈希表就不能使用链表来维护LRU信息，因此需要在buf结构体中添加timestamp域来记录释放的事件，同时prev域也不再需要。

struct buf {

  int valid;   // has data been read from disk?

  int disk;    // does disk "own" buf?

  uint dev;

  uint blockno;

  struct sleeplock lock;

  uint refcnt;

  // struct buf *prev; // LRU cache list

  struct buf *next;

  uchar data[BSIZE];

  uint timestamp;

};

在brelse函数中对timestamp域进行维护，同时将链表的锁替换为桶级锁：

void

brelse(struct buf *b)

{

  if(!holdingsleep(&b->lock))

    panic("brelse");

  releasesleep(&b->lock);

  int idx = hash(b->dev, b->blockno);

  acquire(&hashtable[idx].lock);

  b->refcnt--;

  if (b->refcnt == 0) {

    // no one is waiting for it.

    b->timestamp = ticks;

  }

  release(&hashtable[idx].lock);

}

定义哈希表的结构体，bcache.lock为表级锁，而hashtable[i].lock为桶级锁：

#define NBUCKET 13

#define NBUF (NBUCKET * 3)

struct {

  struct spinlock lock;

  struct buf buf[NBUF];

} bcache;

struct bucket {

  struct spinlock lock;

  struct buf head;

}hashtable[NBUCKET];

int

hash(uint dev, uint blockno)

{

  return blockno % NBUCKET;

}

在binit函数中对哈希表进行初始化，将bcache.buf[NBUF]中的块平均分配给每个桶，记得设置b->blockno使块的hash与桶相对应，后面需要根据块来查找对应的桶。

void

binit(void)

{

  struct buf *b;

  initlock(&bcache.lock, "bcache");

  for(b = bcache.buf; b < bcache.buf+NBUF; b++){

    initsleeplock(&b->lock, "buffer");

  }

  b = bcache.buf;

  for (int i = 0; i < NBUCKET; i++) {

    initlock(&hashtable[i].lock, "bcache_bucket");

    for (int j = 0; j < NBUF / NBUCKET; j++) {

      b->blockno = i; // hash(b) should equal to i

      b->next = hashtable[i].head.next;

      hashtable[i].head.next = b;

      b++;

    }

  }

}

之后就是重点bget函数，首先在对应的桶当中查找当前块是否被缓存，如果被缓存就直接返回；如果没有被缓存的话，就需要查找一个块并将其逐出替换。我这里使用的策略是先在当前桶当中查找，当前桶没有查找到再去全局数组中查找，这样的话如果当前桶中有空闲块就可以避免全局锁。

在全局数组中查找时，要先加上表级锁，当找到一个块之后，就可以根据块的信息查找到对应的桶，之后再对该桶加锁，将块从桶的链表上取下来，释放锁，最后再加到当前桶的链表上去。

这里有个小问题就是全局数组中找到一个块之后，到对该桶加上锁之间有一个窗口，可能就在这个窗口里面这个块就被那个桶对应的本地查找阶段用掉了。因此，需要在加上锁之后判断是否被用了，如果被用了就要重新查找。

static struct buf*

bget(uint dev, uint blockno)

{

  // printf("dev: %d blockno: %d Status: ", dev, blockno);

  struct buf *b;

  int idx = hash(dev, blockno);

  struct bucket* bucket = hashtable + idx;

  acquire(&bucket->lock);

  // Is the block already cached?

  for(b = bucket->head.next; b != 0; b = b->next){

    if(b->dev == dev && b->blockno == blockno){

      b->refcnt++;

      release(&bucket->lock);

      acquiresleep(&b->lock);

      // printf("Cached %p\n", b);

      return b;

    }

  }

  // Not cached.

  // First try to find in current bucket.

  int min_time = 0x8fffffff;

  struct buf* replace_buf = 0;

  for(b = bucket->head.next; b != 0; b = b->next){

    if(b->refcnt == 0 && b->timestamp < min_time) {

      replace_buf = b;

      min_time = b->timestamp;

    }

  }

  if(replace_buf) {

    // printf("Local %d %p\n", idx, replace_buf);

    goto find;

  }

  // Try to find in other bucket.

  acquire(&bcache.lock);

  refind:

  for(b = bcache.buf; b < bcache.buf + NBUF; b++) {

    if(b->refcnt == 0 && b->timestamp < min_time) {

      replace_buf = b;

      min_time = b->timestamp;

    }

  }

  if (replace_buf) {

    // remove from old bucket

    int ridx = hash(replace_buf->dev, replace_buf->blockno);

    acquire(&hashtable[ridx].lock);

    if(replace_buf->refcnt != 0)  // be used in another bucket's local find between finded and acquire

    {

      release(&hashtable[ridx].lock);

      goto refind;

    }

    struct buf *pre = &hashtable[ridx].head;

    struct buf *p = hashtable[ridx].head.next;

    while (p != replace_buf) {

      pre = pre->next;

      p = p->next;

    }

    pre->next = p->next;

    release(&hashtable[ridx].lock);

    // add to current bucket

    replace_buf->next = hashtable[idx].head.next;

    hashtable[idx].head.next = replace_buf;

    release(&bcache.lock);

    // printf("Global %d -> %d %p\n", ridx, idx, replace_buf);

    goto find;

  }

  else {

    panic("bget: no buffers");

  }

  find:

  replace_buf->dev = dev;

  replace_buf->blockno = blockno;

  replace_buf->valid = 0;

  replace_buf->refcnt = 1;

  release(&bucket->lock);

  acquiresleep(&replace_buf->lock);

  return replace_buf;

}

最后将bpin和bunpin的锁替换为桶级锁就行了：

void

bpin(struct buf *b) {

  int idx = hash(b->dev, b->blockno);

  acquire(&hashtable[idx].lock);

  b->refcnt++;

  release(&hashtable[idx].lock);

}

void

bunpin(struct buf *b) {

  int idx = hash(b->dev, b->blockno);

  acquire(&hashtable[idx].lock);

  b->refcnt--;

  release(&hashtable[idx].lock);

}

实验结果如下：

start test0

test0 results:

--- lock kmem/bcache stats

...

lock: bcache_bucket: #fetch-and-add 0 #acquire() 4244

lock: bcache_bucket: #fetch-and-add 0 #acquire() 2246

lock: bcache_bucket: #fetch-and-add 0 #acquire() 4402

lock: bcache_bucket: #fetch-and-add 0 #acquire() 4458

lock: bcache_bucket: #fetch-and-add 0 #acquire() 6450

lock: bcache_bucket: #fetch-and-add 0 #acquire() 6312

lock: bcache_bucket: #fetch-and-add 0 #acquire() 6624

lock: bcache_bucket: #fetch-and-add 0 #acquire() 6634

lock: bcache_bucket: #fetch-and-add 0 #acquire() 12706

lock: bcache_bucket: #fetch-and-add 0 #acquire() 6208

lock: bcache_bucket: #fetch-and-add 0 #acquire() 4360

lock: bcache_bucket: #fetch-and-add 0 #acquire() 4246

lock: bcache_bucket: #fetch-and-add 0 #acquire() 2236

--- top 5 contended locks:

lock: proc: #fetch-and-add 269741 #acquire() 4551132

lock: proc: #fetch-and-add 236112 #acquire() 4551131

lock: proc: #fetch-and-add 186278 #acquire() 4551151

lock: proc: #fetch-and-add 167286 #acquire() 4551164

lock: proc: #fetch-and-add 151922 #acquire() 4551132

tot= 0

test0: OK

start test1

test1 OK