Analysis of the direct IO kernel code and its pitfalls (based on 3.10.0-693.11.1)


The Linux read/write system calls accept an O_DIRECT flag that lets IO bypass the page cache and hit the disk directly (why only "try" to bypass? because if the direct write cannot complete, the kernel falls back to writing through the cache). To make a direct write possible, direct IO imposes a number of restrictions: the file offset must be aligned to the disk block size, the memory address must be aligned to the disk block size, and the IO size must be aligned to the disk block size as well. The implementation of direct IO also has a subtle flaw. I already described that flaw in my analysis of fuse; if the mechanism behind it is unclear, refer to that fuse defect analysis. Here the focus is the direct IO implementation itself, and along the way I will point out where the flaw gets introduced.
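To make the alignment rules concrete, here is a minimal userspace sketch. It is a sketch only: the 512-byte block size is an assumption (the real requirement is the logical block size of the underlying device), and "testfile" is a hypothetical path.

#define _GNU_SOURCE		/* O_DIRECT is a GNU extension */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const size_t blksz = 512;	/* assumed device logical block size */
	void *buf;

	/* the buffer address must be block-aligned, hence posix_memalign */
	if (posix_memalign(&buf, blksz, blksz))
		return 1;
	memset(buf, 'A', blksz);

	int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0)
		return 1;

	/* the offset (0 here) and the length must be block-aligned too,
	 * otherwise the write fails with EINVAL */
	ssize_t n = pwrite(fd, buf, blksz, 0);

	close(fd);
	free(buf);
	return n == (ssize_t)blksz ? 0 : 1;
}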
The call path from the write system call down to direct IO is:
write --> vfs_write --> do_sync_write --> generic_file_aio_write -->__generic_file_aio_write
Direct IO first enters the picture in generic_file_aio_write, so we start the analysis from __generic_file_aio_write:
/*@iocb     the kiocb built by do_sync_write, describing this io
 *@iov      the user buffer handed to write, usually a single element
 *@nr_segs  1
 *@ppos     the current file position
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t *ppos)
{
	// preliminary checks and setup elided
	...
	if (io_is_direct(file)) {
		loff_t endbyte;
		ssize_t written_buffered;
		// try the direct write first
		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
							ppos, count, ocount);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || written == count || IS_DAX(inode))
			goto out;

		pos += written;
		count -= written;
		// the direct write fell short; write the remainder through the page cache
		written_buffered = generic_file_buffered_write(iocb, iov,
						nr_segs, pos, ppos, count,
						written);
		/*
		 * If generic_file_buffered_write() retuned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}

		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		// flush the buffered-fallback range to disk to preserve O_DIRECT semantics
		endbyte = pos + written_buffered - written - 1;
		err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
		...
	} 
	...
}

If the file was opened with the O_DIRECT flag, the kernel first tries to write straight to disk; if the direct path cannot complete (short of an outright error), it falls back to a buffered write, putting the data into the page cache and then flushing that cache to disk. That is why the man page describes O_DIRECT not as "do not use the cache" but as "Try to minimize cache effects of the I/O to and from this file."
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
		size_t count, size_t ocount)
{
	...
	// first flush any dirty page cache covering the write range to disk
	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
	if (written)
		goto out;

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached page from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	if (mapping->nrpages) {
		// the dirty pages were just flushed; now invalidate the cached pages over the range. why is this needed? see the discussion below the function
		written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);
		/*
		 * If a page can not be invalidated, return 0 to fall back
		 * to buffered write.
		 */
		if (written) {
			if (written == -EBUSY)
				return 0;
			goto out;
		}
	}

	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 */
	 // why invalidate again after the direct write has finished? see the mmap scenario discussed below the function
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      pos >> PAGE_CACHE_SHIFT, end);
	}
	...
}

generic_file_direct_write first flushes the page cache covering the write range to disk, then invalidates the corresponding pages, and invalidates them once more after the direct write finishes. Why? Consider the following situation:
a process mmaps one page of the file, then hands that memory to write() for a direct IO, and the address being written happens to fall inside the mapped block.
If the page cache in that range were not invalidated once the direct IO completes, later accesses through the cache would see stale data. That is the one reason given in the English comment above; the other reasons are still not entirely clear to me. If you are unfamiliar with how mmap works, see my analysis of the mmap implementation.
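A sketch of that scenario in userspace (hypothetical file name, a 4096-byte page size is assumed, and error handling is trimmed): the process maps page 0 of the file and then direct-writes that very mapping back into the file.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_DIRECT);	/* hypothetical file */
	if (fd < 0)
		return 1;

	/* map page 0 of the file; get_user_pages() on this buffer later
	 * faults the file page into the page cache */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* direct-write the mapped page back to file offset 0: the source
	 * buffer is the write destination's own page-cache page, faulted in
	 * by get_user_pages() during the write.  That repopulated page is
	 * why generic_file_direct_write invalidates the range a second time
	 * after ->direct_IO returns. */
	pwrite(fd, p, 4096, 0);

	munmap(p, 4096);
	close(fd);
	return 0;
}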
generic_file_direct_write invokes the direct IO routine supplied by the filesystem, but as with most other filesystem hooks, the implementation in most filesystems is a thin wrapper around the kernel's generic direct IO code. Taking ext2 as the example, the call eventually lands in do_blockdev_direct_IO. Note that this one function serves both reads and writes, so keep that in mind while reading it.
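For reference, ext2's ->direct_IO hook in this kernel generation is little more than a wrapper; the sketch below is reproduced from memory of the 3.10-era source, so treat the details as approximate:

/* fs/ext2/inode.c (3.10-era, approximate) */
static ssize_t
ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
		loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	/* blockdev_direct_IO expands to __blockdev_direct_IO and from there
	 * to do_blockdev_direct_IO, with DIO_LOCKING | DIO_SKIP_HOLES and
	 * ext2_get_block as the block-mapping callback */
	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
				 ext2_get_block);
	if (ret < 0 && (rw & WRITE))
		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
	return ret;
}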
 /*
 @rw         READ or WRITE
 @iocb       the kiocb describing this io; carries the file, offset, etc.
 @inode      the file's inode
 @bdev       the block device backing the filesystem
 @iov        the user iov, usually a single element
 @offset     the file offset
 @nr_segs    usually 1
 @get_block  filesystem callback that maps file blocks to disk blocks
 @end_io     io-completion callback; NULL here
 @submit_io  submit hook; NULL here
 @flags      DIO_LOCKING | DIO_SKIP_HOLES here
 */
static inline ssize_t
do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
	dio_submit_t submit_io,	int flags)
{
	int seg;
	size_t size;
	unsigned long addr;
	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
	unsigned blkbits = i_blkbits;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;
	struct dio *dio;
	struct dio_submit sdio = { 0, };
	unsigned long user_addr;
	size_t bytes;
	struct buffer_head map_bh = { 0, };
	struct blk_plug plug;

	if (rw & WRITE)
		rw = WRITE_ODIRECT;

	/*
	 * Avoid references to bdev if not absolutely needed to give
	 * the early prefetch in the caller enough time.
	 */

	// check that the offset is aligned to the inode block size; if not, recheck against the device's logical block size
	if (offset & blocksize_mask) {
		if (bdev)
			blkbits = blksize_bits(bdev_logical_block_size(bdev));
		blocksize_mask = (1 << blkbits) - 1;
		if (offset & blocksize_mask)
			goto out;
	}

	/* Check the memory alignment.  Blocks cannot straddle pages */
	// check that each iov base address and length is aligned, first against
	// the inode block size and, failing that, the device logical block size.
	// Why is the finer alignment acceptable? the minimal unit of device io
	// is its logical block, and block sizes are all powers of two, so io
	// aligned to the smaller device block still maps onto whole blocks.
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if (unlikely((addr & blocksize_mask) ||
			     (size & blocksize_mask))) {
			if (bdev)
				blkbits = blksize_bits(
					 bdev_logical_block_size(bdev));
			blocksize_mask = (1 << blkbits) - 1;
			if ((addr & blocksize_mask) || (size & blocksize_mask))
				goto out;
		}
	}

	/* watch out for a 0 len io from a tricksy fs */
	// a zero-length read: nothing to do, return immediately
	if (rw == READ && end == offset)
		return 0;

	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
	retval = -ENOMEM;
	if (!dio)
		goto out;
	/*
	 * Believe it or not, zeroing out the page array caused a .5%
	 * performance regression in a database benchmark.  So, we take
	 * care to only zero out what's needed.
	 */
	memset(dio, 0, offsetof(struct dio, pages));

	dio->flags = flags;
	if (dio->flags & DIO_LOCKING) {	/* direct IO must be serialized against buffered IO on the same file */
		if (rw == READ) {
			struct address_space *mapping =
					iocb->ki_filp->f_mapping;

			/* will be released by direct_io_worker */
			mutex_lock(&inode->i_mutex);	/* take the inode mutex so buffered writers cannot race the flush and read below */

			// flush dirty pages over the read range to disk first so the direct read sees fresh data; writers also take i_mutex, so no write can slip in during the read
			retval = filemap_write_and_wait_range(mapping, offset,
							      end - 1);
			if (retval) {
				mutex_unlock(&inode->i_mutex);
				kmem_cache_free(dio_cache, dio);
				goto out;
			}
		}
	}

	/* Once we sampled i_size check for reads beyond EOF */
	// a read that starts at or beyond EOF simply returns 0
	dio->i_size = i_size_read(inode);
	if (rw == READ && offset >= dio->i_size) {
		if (dio->flags & DIO_LOCKING)
			mutex_unlock(&inode->i_mutex);
		kmem_cache_free(dio_cache, dio);
		retval = 0;
		goto out;
	}

	/*
	 * For file extending writes updating i_size before data writeouts
	 * complete can expose uninitialized blocks in dumb filesystems.
	 * In that case we need to wait for I/O completion even if asked
	 * for an asynchronous write.
	 */
	if (is_sync_kiocb(iocb)) /* synchronous io: do_sync_write/do_sync_read build the kiocb as synchronous */
		dio->is_async = false;
	else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
            (rw & WRITE) && end > i_size_read(inode))
		dio->is_async = false;
	else
		dio->is_async = true;

	dio->inode = inode;
	dio->rw = rw;

	/*
	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
	 * so that we can call ->fsync.
	 */
	// on our synchronous path dio->is_async is false, so this branch is not taken
	if ((dio->inode->i_sb->s_type->fs_flags & FS_HAS_DIO_IODONE2) &&
	    dio->is_async && (rw & WRITE) &&
	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
		retval = dio_set_defer_completion(dio);
		if (retval) {
			/*
			 * We grab i_mutex only for reads so we don't have
			 * to release it here
			 */
			kmem_cache_free(dio_cache, dio);
			goto out;
		}
	}

	/*
	 * Will be decremented at I/O completion time.
	 */
	// bump the inode's outstanding-direct-IO count via inode_dio_begin; it is dropped again at io completion
	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = 0;
	sdio.blkbits = blkbits;	/* the inode block size, unless the alignment
				   checks above downgraded it to the device's
				   logical block size */
	sdio.blkfactor = i_blkbits - blkbits;	/* usually 0 */
	sdio.block_in_file = offset >> blkbits;	/* first block of the io within the file */

	sdio.get_block = get_block;
	dio->end_io = end_io;
	sdio.submit_io = submit_io;
	sdio.final_block_in_bio = -1;
	sdio.next_block_for_io = -1;

	dio->iocb = iocb;

	spin_lock_init(&dio->bio_lock);
	dio->refcount = 1;

	/*
	 * In case of non-aligned buffers, we may need 2 more
	 * pages since we need to zero out first and last block.
	 */
	// pages_in_io below counts the total number of user pages this io touches
	if (unlikely(sdio.blkfactor))
		sdio.pages_in_io = 2;

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		sdio.pages_in_io +=
			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
				PAGE_SIZE - user_addr / PAGE_SIZE);
	}

	blk_start_plug(&plug);

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		sdio.size += bytes = iov[seg].iov_len;

		/* Index into the first page of the first block */
		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; /* block offset, within its page, of this segment's first block */
		sdio.final_block_in_request = sdio.block_in_file +
						(bytes >> blkbits); /* one past this segment's last file block; do_direct_IO advances sdio.block_in_file as it maps, so the difference afterwards is what was left undone */
		/* Page fetching state */
		sdio.head = 0;
		sdio.tail = 0;
		sdio.curr_page = 0;

		sdio.total_pages = 0;
		if (user_addr & (PAGE_SIZE-1)) {
			sdio.total_pages++;
			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
		}
		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;  /* total pages spanned by this segment, counting the partial first and last pages */
		sdio.curr_user_address = user_addr;

		retval = do_direct_IO(dio, &sdio, &map_bh); /* the core routine: map blocks, build and submit bios */

		/* final_block_in_request was set to block_in_file +
		 * (bytes >> blkbits) above, and do_direct_IO advances
		 * block_in_file as it maps, so this adds the bytes actually
		 * processed to dio->result */
		dio->result += iov[seg].iov_len -
			((sdio.final_block_in_request - sdio.block_in_file) <<
					blkbits);

		if (retval) {
			dio_cleanup(dio, &sdio);
			break;
		}
	} /* end iovec loop */

	if (retval == -ENOTBLK) {
		/*
		 * The remaining part of the request will be
		 * be handled by buffered I/O when we return
		 */
		retval = 0;
	}
	/*
	 * There may be some unwritten disk at the end of a part-written
	 * fs-block-sized block.  Go zero that now.
	 */
	// if the io ends partway through an fs block (possible when the device
	// block is smaller than the fs block that get_block maps, i.e. when
	// blkfactor != 0), zero out the rest of that final fs block on disk
	dio_zero_block(dio, &sdio, 1, &map_bh);

	if (sdio.cur_page) {
		ssize_t ret2;

		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
		if (retval == 0)
			retval = ret2;
		page_cache_release(sdio.cur_page);
		sdio.cur_page = NULL;
	}
	if (sdio.bio)
		dio_bio_submit(dio, &sdio);

	blk_finish_plug(&plug);

	/*
	 * It is possible that, we return short IO due to end of file.
	 * In that case, we need to release all the pages we got hold on.
	 */
	dio_cleanup(dio, &sdio);

	/*
	 * All block lookups have been performed. For READ requests
	 * we can let i_mutex go now that its achieved its purpose
	 * of protecting us from looking up uninitialized blocks.
	 */
	if (rw == READ && (dio->flags & DIO_LOCKING))
		mutex_unlock(&dio->inode->i_mutex);

	/*
	 * The only time we want to leave bios in flight is when a successful
	 * partial aio read or full aio write have been setup.  In that case
	 * bio completion will call aio_complete.  The only time it's safe to
	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
	 * This had *better* be the only place that raises -EIOCBQUEUED.
	 */
	// only a successful async io leaves bios in flight; their completion then calls aio_complete
	BUG_ON(retval == -EIOCBQUEUED);
	if (dio->is_async && retval == 0 && dio->result &&
	    ((rw == READ) || (dio->result == sdio.size)))
		retval = -EIOCBQUEUED;

	if (retval != -EIOCBQUEUED)
		dio_await_completion(dio);	/* synchronous path: block until every bio completes */

	if (drop_refcount(dio) == 0) {
		retval = dio_complete(dio, offset, retval, false);
	} else
		BUG_ON(retval != -EIOCBQUEUED);

out:
	return retval;
}

do_blockdev_direct_IO is long but does little: for each contiguous piece of user space it initializes sdio and dio and calls do_direct_IO. For the plain read/write system calls the iov always has exactly one element. sdio and dio appear to exist purely for direct IO, tracking the user pages, the block mapping, and bio submission. Let us analyze do_direct_IO.
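Before diving in, here is an annotated excerpt of the dio_submit fields exercised below. This is an approximation of the 3.10-era definition in fs/direct-io.c with several fields omitted, so treat it as a reading aid rather than the authoritative layout:

/* excerpt of struct dio_submit, fs/direct-io.c (approximate) */
struct dio_submit {
	struct bio *bio;		/* bio under assembly */
	unsigned blkbits;		/* fs block size bits, or the device's if finer */
	unsigned blkfactor;		/* i_blkbits - blkbits; 0 when they match */
	int pages_in_io;		/* rough total page count for this io */
	size_t size;			/* total bytes in this io */
	sector_t block_in_file;		/* current file offset, in blkbits units */
	sector_t final_block_in_request; /* one past the segment's last block */
	unsigned first_block_in_page;	/* block offset inside the first page */
	get_block_t *get_block;		/* fs callback: file block -> disk block */
	dio_submit_t *submit_io;	/* optional custom submit hook */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next disk block the bio should cover */
	struct page *cur_page;		/* deferred page not yet placed in a bio */

	/* page staging ring, filled by get_user_pages_fast(): */
	int curr_page;			/* pages of the segment pinned so far */
	int total_pages;		/* pages covered by the segment */
	unsigned long curr_user_address; /* next user address to pin */
	unsigned head;			/* next page to consume in dio->pages[] */
	unsigned tail;			/* last valid page + 1 */
};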
/*
 * Walk the user pages, and the file, mapping blocks to disk and generating
 * a sequence of (page,offset,len,block) mappings.  These mappings are injected
 * into submit_page_section(), which takes care of the next stage of submission
 *
 * Direct IO against a blockdev is different from a file.  Because we can
 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 * blockdev IO be able to have fine alignment and large sizes.
 *
 * So what we do is to permit the ->get_block function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
 // walk the user pages and the file in step, producing (page,offset,len,block) mappings via the buffer_head filled in by get_block, and hand each piece on for submission
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
			struct buffer_head *map_bh)
{
	const unsigned blkbits = sdio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits; /* blkbits is usually the fs block size, so this is how many blocks fit in one page */
	struct page *page;
	unsigned block_in_page;
	int ret = 0;

	/* The I/O can start at any block offset within the first page */
	// sdio->first_block_in_page is the block offset, inside its page, at
	// which the io starts; only the first page can start mid-page, so
	// block_in_page is reset to 0 at the bottom of the while loop
	block_in_page = sdio->first_block_in_page; 

	while (sdio->block_in_file < sdio->final_block_in_request) { /* until every block of this iov segment has been handled */
		// grab the next user page backing the buffer; where this page comes from is the interesting part, see the analysis below
		page = dio_get_page(dio, sdio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		while (block_in_page < blocks_per_page) { /* handle each block inside the current page */
			...
			// block-mapping logic elided: get_block maps the file
			// blocks, and each mapped chunk of the page is handed
			// to submit_page_section to be packed into a bio
			ret = submit_page_section(dio, sdio, page,
						  offset_in_page,
						  this_chunk_bytes,
						  sdio->next_block_for_io,
						  map_bh);
			...
		}

		/* Drop the ref which was taken in get_user_pages() */
		page_cache_release(page);
		block_in_page = 0;	/* pages after the first always start at block 0 */
	}
out:
	return ret;
}

The piece to focus on here is dio_get_page, which supplies the pages that go into the bio. Where do those pages come from? This used to puzzle me when reading the buffered IO code: for buffered IO the bio pages come from the file's address_space, which is natural, but direct IO has no buffer of its own, and allocating temporary pages for it would be slow.
/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently.  To provide nicer use of the
 * L1 cache.
 */
 // pages are pinned from the user buffer in batches and cached inside sdio; each call hands out the next cached page
static inline struct page *dio_get_page(struct dio *dio,
		struct dio_submit *sdio)
{
	if (dio_pages_present(sdio) == 0) { /* pages remaining in sdio's ring; 0 means it needs refilling */
		int ret;

		ret = dio_refill_pages(dio, sdio); /* pin the next batch of user pages into sdio */
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(sdio) == 0);
	}
	return dio->pages[sdio->head++]; /* pop the next page off the ring */
}
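
dio_pages_present just reports how many pinned pages remain between head and tail; from memory of the 3.10-era source it is simply the following (approximate):

static inline unsigned dio_pages_present(struct dio_submit *sdio)
{
	return sdio->tail - sdio->head;
}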

/*
 * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
 */
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
	int ret;
	int nr_pages;

	nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); /* pages left to pin, capped at DIO_PAGES per batch */
	ret = get_user_pages_fast(	/* pin the physical pages backing the user buffer; the fast path avoids taking mmap_sem */
		sdio->curr_user_address,		/* Where from? */
		nr_pages,			/* How many pages? */
		dio->rw == READ,		/* Write to memory? */
		&dio->pages[0]);		/* Put results here */

	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
		struct page *page = ZERO_PAGE(0);
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		// on a direct write, if pinning the user pages faults but the
		// filesystem has already mapped blocks for this io, those
		// blocks must still be written, or stale data would leak into
		// the file; so the zero page is written into them instead
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		page_cache_get(page);
		dio->pages[0] = page;
		sdio->head = 0;
		sdio->tail = 1;
		ret = 0;
		goto out;
	}

	if (ret >= 0) {
		sdio->curr_user_address += ret * PAGE_SIZE;	/* next address of this iov segment still to pin */
		sdio->curr_page += ret; /* pages of this iov segment pinned so far */
		sdio->head = 0;
		sdio->tail = ret;
		ret = 0;
	}
out:
	return ret;	
}

As you can see, dio_get_page ultimately goes through get_user_pages_fast to obtain the very pages backing the user-supplied buffer, so direct IO can move data to disk without the extra copy into the page cache.
Using the user's pages directly has its benefits, but it also brings in a potentially huge pit. Suppose a direct read is in flight: after dio_get_page has pinned the pages but before the bio completes, the process forks and a write then triggers COW, swapping the page table entry out from under the IO. Awkward, to say the least. How do you dodge this pit? Three ways: first, page-align every memory allocation in a multithreaded process; second, never fork in a multithreaded environment, including indirectly via system(), popen(), and the like; third, never issue direct IO reads in a multithreaded environment.
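A sketch of the dangerous pattern (hypothetical file name; whether data actually goes astray depends on timing, so this illustrates the race rather than reliably reproducing it):

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

static void *reader(void *arg)
{
	int fd = *(int *)arg;
	void *buf;

	/* block-aligned but deliberately NOT page-aligned: the page holding
	 * buf also holds other heap data, so unrelated writes can touch it */
	if (posix_memalign(&buf, 512, 512))
		return NULL;

	/* the race window: dio_get_page() pins the physical page here and
	 * the bio completes later.  If a fork lands in between and either
	 * side then writes any byte in this page, COW hands the writer a
	 * fresh copy while the DMA still targets the old physical page, so
	 * the data read from disk can end up invisible to this thread */
	pread(fd, buf, 512, 0);
	free(buf);
	return NULL;
}

int main(void)
{
	int fd = open("testfile", O_RDONLY | O_DIRECT);	/* hypothetical */
	pthread_t t;

	if (fd < 0)
		return 1;
	pthread_create(&t, NULL, reader, &fd);
	/* another thread forks (system()/popen() fork internally) while
	 * the direct read may still be in flight */
	if (fork() == 0)
		_exit(0);
	pthread_join(t, NULL);
	close(fd);
	return 0;
}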
Below submit_page_section, direct IO reaches the BIO layer, which I have not studied, so the analysis ends here.