Linux readシステム呼び出しのpage_cache_sync_readahead()

8946 ワード

1 page_cache_sync_readahead()

前の記事で見たように、do_generic_file_read()関数はラベルfind_pageでpage_cache_sync_readahead()を呼び出し、同期プリフェッチを開始します。

/*
 * page_cache_sync_readahead - entry point for synchronous readahead.
 * @mapping:  address_space the pages belong to
 * @ra:       readahead state; ra->ra_pages == 0 disables readahead
 * @filp:     file being read (may be NULL)
 * @offset:   page index at which the read starts
 * @req_size: size of the request, passed through unchanged
 *
 * Dispatches to force_page_cache_readahead() for files opened with
 * FMODE_RANDOM, and to ondemand_readahead() otherwise.
 */
void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)
{
	/* A zero window limit means readahead is switched off. */
	if (ra->ra_pages == 0)
		return;

	if (filp && (filp->f_mode & FMODE_RANDOM) != 0) {
		/* Random access was requested: bypass the heuristics. */
		force_page_cache_readahead(mapping, filp, offset, req_size);
	} else {
		/* Not called from a readahead-marker hit, hence "false". */
		ondemand_readahead(mapping, ra, filp, false, offset, req_size);
	}
}

プリフェッチの最大ページ数が0の場合、プリフェッチはありません.

ファイルがランダムアクセス指定(FMODE_RANDOM)で開かれている場合は、force_page_cache_readahead()を呼び出します。

それ以外の場合は、ondemand_readahead()を呼び出してプリフェッチを開始します。

2 ondemand_readahead()

/*
 * ondemand_readahead - choose a readahead window for a read and submit it.
 * @mapping:              address_space being read
 * @ra:                   readahead state; start/size/async_size are
 *                        rewritten here unless the read is treated as a
 *                        standalone random read
 * @filp:                 file being read, passed through to the submit path
 * @hit_readahead_marker: true when the caller hit a page marked for
 *                        readahead
 * @offset:               page index of the read
 * @req_size:             number of pages requested
 *
 * The checks below run in a fixed order: start of file, expected
 * sequential offset, marker hit, oversize request, sequential cache miss,
 * context readahead, and finally a plain random read.
 *
 * Returns the value of ra_submit() / __do_page_cache_readahead(), or 0
 * when a marker hit yields no usable window.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
		   bool hit_readahead_marker, pgoff_t offset,
		   unsigned long req_size)
{
	/* Upper bound for the window, derived from ra->ra_pages. */
	unsigned long max = max_sane_readahead(ra->ra_pages);
	pgoff_t prev_offset;

	/*
	 * start of file
	 */
	if (!offset)
		goto initial_readahead;

	/*
	 * It's the expected callback offset, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((offset == (ra->start + ra->size - ra->async_size) ||
	     offset == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_hole(mapping, offset + 1, max);
		rcu_read_unlock();

		/*
		 * Give up when start is 0 or lies more than max pages past
		 * offset. NOTE(review): presumably 0 means no hole was
		 * found - confirm against page_cache_next_hole().
		 */
		if (!start || start - offset > max)
			return 0;

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (offset - prev_offset) == 1
	 * unaligned reads: (offset - prev_offset) == 0
	 */
	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
	if (offset - prev_offset <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, offset, req_size, max))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 * (The lookahead size passed here is 0 and *ra is left untouched.)
	 */
	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);

initial_readahead:
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	/*
	 * async_size is the part of the window beyond the request; if the
	 * window is not larger than the request, it is the whole window.
	 */
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 */
	if (offset == ra->start && ra->size == ra->async_size) {
		ra->async_size = get_next_ra_size(ra, max);
		ra->size += ra->async_size;
	}

	return ra_submit(ra, mapping, filp);
}

具体的なコードは分析しないで、主な論理を話します:

まず、ファイルの先頭からの読み出しであれば、順次読み出しであると判断し、プリフェッチ情報を初期化します。デフォルト設定では4ページがプリフェッチされます。

ファイルの先頭からの読み取りでなければ、連続した読み取り要求であるか否かを判断し、もしそうであればプリフェッチ数を拡大します。一般的には前回のプリフェッチ数の2倍になります。

そうでなければランダムな読み取りと判断し、プリフェッチは行わず、sys_readで要求されたページ数だけを読み出します。

最後にra_submit()を呼び出してリードリクエストを送信します。

3 ra_submit()

/*
 * ra_submit - submit the readahead window described by @ra.
 *
 * Thin wrapper: unpacks ra->start, ra->size and ra->async_size into the
 * offset, page-count and lookahead arguments of
 * __do_page_cache_readahead() and returns its result.
 */
static inline unsigned long ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	unsigned long nr_to_read = ra->size;
	unsigned long lookahead_size = ra->async_size;

	return __do_page_cache_readahead(mapping, filp, ra->start,
					 nr_to_read, lookahead_size);
}

ra_submit()関数は__do_page_cache_readahead()をラップしているにすぎません。

4 __do_page_cache_readahead()

/*
 * __do_page_cache_readahead - allocate missing pages and start reading them.
 * @mapping:        address_space to populate
 * @filp:           file being read, handed on to read_pages()
 * @offset:         index of the first page to consider
 * @nr_to_read:     number of consecutive page indexes to consider
 * @lookahead_size: the page at loop position nr_to_read - lookahead_size
 *                  gets the readahead flag set
 *
 * Page indexes already present in mapping->page_tree are skipped. The scan
 * stops at the last page inside i_size or at the first failed allocation.
 *
 * Returns the number of pages that were allocated and passed to
 * read_pages(); 0 for an empty file.
 */
int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	loff_t isize = i_size_read(inode);
	LIST_HEAD(page_pool);
	unsigned long end_index;	/* Index of the last page inside i_size */
	struct page *page;
	int nr_queued = 0;
	int page_idx;

	/* Nothing to read from an empty file. */
	if (isize == 0)
		return nr_queued;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Collect a freshly allocated page for every index in the range
	 * that has no regular page in the cache yet.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		/* Never go past the end of the file. */
		if (page_offset > end_index)
			break;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();

		/* Already cached (and not an exceptional entry): skip it. */
		if (page && !radix_tree_exceptional_entry(page))
			continue;

		page = page_cache_alloc_readahead(mapping);
		if (!page)
			break;

		page->index = page_offset;
		list_add(&page->lru, &page_pool);

		/* Mark the page at which the lookahead part begins. */
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);

		nr_queued++;
	}

	/*
	 * Kick off the I/O. Errors are deliberately ignored here: a page
	 * that does not become uptodate makes the caller issue readpage
	 * again, and the error is handled there.
	 */
	if (nr_queued != 0)
		read_pages(mapping, filp, &page_pool, nr_queued);

	/* read_pages() must have consumed every page on the list. */
	BUG_ON(!list_empty(&page_pool));

	return nr_queued;
}

メモリを割り当てる前に、他のプロセスがメモリに一部のページを読み込んだ可能性があるため、まずページがcacheにあるかどうかを確認します.

ページcacheにない場合、メモリページが割り当てられ、ページプール(page_pool)にページが追加される.

nr_to_read - lookahead_size番目のページを割り当てたときは、そのページにPG_readaheadフラグを設定し、次回の非同期プリフェッチが行われるようにします。

ページの準備が完了したら、read_pages()を呼び出してファイルデータを読み出します。

5 read_pages()

/*
 * read_pages - issue reads for a list of newly allocated pages.
 * @mapping:  address_space the pages are to be added to
 * @filp:     file being read
 * @pages:    list of pages, linked through page->lru
 * @nr_pages: number of pages on @pages
 *
 * Uses the address_space's ->readpages() method when it has one; otherwise
 * adds each page to the page cache and calls ->readpage() on it. The whole
 * operation is bracketed by blk_start_plug()/blk_finish_plug().
 *
 * Returns the ->readpages() result, or 0 on the per-page path.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	struct blk_plug plug;
	unsigned remaining;
	int ret = 0;

	blk_start_plug(&plug);		/* block/blk-core.c, line 3033 */

	if (mapping->a_ops->readpages) {
		/* e.g. ext3_readpages(): fs/ext3/inode.c, line 1934 */
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Drop whatever pages the method left on the list. */
		put_pages_list(pages);
	} else {
		for (remaining = nr_pages; remaining > 0; remaining--) {
			struct page *page = list_to_page(pages);

			list_del(&page->lru);
			/* Read the page only if it was inserted into the cache. */
			if (add_to_page_cache_lru(page, mapping,
						page->index, GFP_KERNEL) == 0)
				mapping->a_ops->readpage(filp, page);
			page_cache_release(page);
		}
	}

	blk_finish_plug(&plug);		/* block/blk-core.c, line 3196 */

	return ret;
}

blk_start_plug()でplug->listを初期化し、plugをtask_structに登録します。

次に、それぞれのファイルシステムが実装するreadpagesメソッドを呼び出します。

最後にblk_finish_plug()を呼び出し、plug->listに溜まったリクエストを下位レイヤへフラッシュして、task_structのplugを空にします。

6 ext3_readpages()

/*
 * ext3_readpages - ->readpages() implementation for ext3.
 *
 * Delegates to mpage_readpages(), supplying ext3_get_block as the block
 * mapping callback. @file is not used.
 */
static int
ext3_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	int err;

	/* mpage_readpages(): fs/mpage.c, line 356 */
	err = mpage_readpages(mapping, pages, nr_pages, ext3_get_block);

	return err;
}

ext3_readpages()関数はmpage_readpages()のラッパーです。最後のパラメータext3_get_blockが関数ポインタであることに注意してください。

ext3_get_block関数は、ディスク上のページのblock番号を計算します.

7 mpage_readpages()

/*
 * mpage_readpages - add a list of pages to the page cache and read them.
 * @mapping:   address_space the pages are added to
 * @pages:     list of pages linked through page->lru; consumed from the
 *             tail (pages->prev)
 * @nr_pages:  number of pages on @pages
 * @get_block: callback handed to do_mpage_readpage()
 *
 * Each page that is successfully inserted into the page cache is passed to
 * do_mpage_readpage(), which carries one bio across iterations. Any bio
 * still pending after the loop is submitted as a READ.
 *
 * Always returns 0.
 */
int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
				unsigned nr_pages, get_block_t get_block)
{
	struct buffer_head map_bh;
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;
	unsigned long first_logical_block = 0;
	unsigned remaining;

	map_bh.b_state = 0;
	map_bh.b_size = 0;

	/* "remaining" counts this page plus all pages still on the list. */
	for (remaining = nr_pages; remaining > 0; remaining--) {
		struct page *page = list_entry(pages->prev, struct page, lru);

		prefetchw(&page->flags);
		list_del(&page->lru);

		/* Only pages that made it into the page cache are read. */
		if (add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL) == 0) {
			bio = do_mpage_readpage(bio, page, remaining,
					&last_block_in_bio, &map_bh,
					&first_logical_block,
					get_block);
		}

		page_cache_release(page);
	}

	/* Every page must have been taken off the list by now. */
	BUG_ON(!list_empty(pages));

	/* Flush the bio left over from the last do_mpage_readpage() call. */
	if (bio)
		mpage_bio_submit(READ, bio);

	return 0;
}

ループ内でdo_mpage_readpage()を呼び出し、必要なページをbioデータ構造に変換します。ページの内容がディスク上で連続している場合は1つのbioのみを使用し、連続しない場合は複数のbioを使用します。bioは単方向リストです。

最後にmpage_bio_submit()を呼び出してbioをサブミットします。

8 mpage_bio_submit()

/*
 * mpage_bio_submit - finish setting up a bio and submit it.
 * @rw:  direction passed on to guard_bio_eod() and submit_bio()
 * @bio: bio to submit
 *
 * Installs mpage_end_io as the bio's completion callback, calls
 * guard_bio_eod() and then submit_bio().
 *
 * Always returns NULL.
 */
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
	/* Completion callback for this bio. */
	bio->bi_end_io = mpage_end_io;
	/* NOTE(review): presumably guards against I/O past end of device - confirm. */
	guard_bio_eod(rw, bio);
	submit_bio(rw, bio);		/* block/blk-core.c, line 1965 */
	return NULL;
}

bioの完了メソッドをmpage_end_ioに設定します。mpage_end_ioはI/O操作が終了した後に呼び出され、対応するページフラグをセットします。

bioをgeneric block layerにサブミットします。

Lazy Evaluation

オペレーティングシステムの理解]プロセスとLinuxスケジューリング(CFS)