PostgreSQL源码分析——SeqScan

这里，我们分析一下顺序扫描的过程，以最简单的select * from t1;语句为例，分析一下查询的过程。

postgres@postgres=# \d t1;Table "public.t1"Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------a      | integer |           | not null | b      | integer |           |          | 
Indexes:"t1_pkey" PRIMARY KEY, btree (a)-- 顺序扫描
postgres@postgres=# explain select * from t1;QUERY PLAN                      
------------------------------------------------------Seq Scan on t1  (cost=0.00..15.00 rows=1000 width=8)
(1 row)

语法解析以及语义分析

这部分前面已有分析过，这里简要描述：

解析层面流程如下：

exec_simple_query
--> pg_parse_query              // 生成抽象语法树--> raw_parser--> base_yyparse
--> pg_analyze_and_rewrite      // 语义分析，转为查询树Query

语法表示可查看gram.y中的定义。

simple_select:SELECT opt_all_clause opt_target_list   // select *into_clause from_clause where_clause    // from t1;group_clause having_clause window_clause{SelectStmt *n = makeNode(SelectStmt);// ......$$ = (Node *)n;}
from_clause:FROM from_list							{ $$ = $2; }| /*EMPTY*/								{ $$ = NIL; };from_list:table_ref								{ $$ = list_make1($1); }| from_list ',' table_ref				{ $$ = lappend($1, $3); };table_ref:	relation_expr opt_alias_clause{$1->alias = $2;$$ = (Node *) $1;}

关键数据结构：

// select语句的抽象语法树表示
typedef struct SelectStmt
{NodeTag		type;// 对应select *List	   *targetList;		/* the target list (of ResTarget) */ // 对应         from t1List	   *fromClause;		/* the FROM clause */  // 保存RangeVar节点// ...... } SelectStmt;/** RangeVar - range variable, used in FROM clauses** Also used to represent table names in utility statements; there, the alias* field is not used, and inh tells whether to apply the operation* recursively to child tables.  In some contexts it is also useful to carry* a TEMP table indication here.*/
typedef struct RangeVar
{NodeTag		type;char	   *catalogname;	/* the catalog (database) name, or NULL */char	   *schemaname;		/* the schema name, or NULL */char	   *relname;		/* the relation/sequence name */bool		inh;			/* expand rel by inheritance? recursively act* on children? */char		relpersistence; /* see RELPERSISTENCE_* in pg_class.h */Alias	   *alias;			/* table alias & optional column aliases */int			location;		/* token location, or -1 if unknown */
} RangeVar;    // 存储表的所在库名，模式名，表名信息语义分析流程：
```c
pg_analyze_and_rewrite   // 语义分析，生成查询树Query
--> pg_analyze--> transformStmt--> transformSelectStmt--> transformFromClause  // 处理表， 将RangeVar转为RangeTblEntry--> transformFromClauseItem--> transformTableEntry     --> transformTargetList     // 处理 * ，展开为a, b
--> pg_rewrite_query

查询优化

主要是生成执行计划，前面已经通过EXPLAIN命令得知其执行计划为顺序扫描，让我们具体看一下其是如何生成的。

pg_plan_queries
--> pg_plan_query--> planner--> standard_planner--> subquery_planner--> grouping_planner--> query_planner 		// Generate a path (that is, a simplified plan) for a basic query--> setup_simple_rel_arrays--> add_base_rels_to_query--> build_simple_rel--> make_one_rel	// Finds all possible access paths for executing a query--> set_base_rel_sizes--> set_rel_size--> set_plain_rel_size--> set_baserel_size_estimates	// 选择率计算，计算代价Cost要用--> set_base_rel_pathlists--> set_rel_pathlist // Build access paths for a base relation--> set_values_pathlist--> create_seqscan_path // 生成顺序扫描路径--> make_rel_from_joinlist--> apply_scanjoin_target_to_paths--> create_plan--> create_scan_plan--> create_seqscan_pla

这里有个函数要重点说明一下，在涉及多表路径规划时，单表路径是最基础的，单表路径可以是顺序扫描、索引扫描、TID扫描等，这个是最底层的。

/* Build access paths for a plain relation (no subquery, no inheritance) */
void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
{Relids		required_outer;/* We don't support pushing join clauses into the quals of a seqscan, but* it could still have required parameterization due to LATERAL refs in its tlist. */required_outer = rel->lateral_relids;/* Consider sequential scan */add_path(rel, create_seqscan_path(root, rel, required_outer, 0));	// 顺序扫描/* If appropriate, consider parallel sequential scan */if (rel->consider_parallel && required_outer == NULL)		// 尝试并行顺序扫描create_plain_partial_paths(root, rel);/* Consider index scans */create_index_paths(root, rel);			// 索引扫描/* Consider TID scans */create_tidscan_paths(root, rel);      // TID扫描
}

最后由最佳路径Path生成顺序扫描执行计划SeqScan：

/* ==========* Scan nodes* ========== */
typedef struct Scan
{Plan		plan;// 指明扫描那个表，Index		scanrelid;		/* relid is index into the range table */
} Scan;/* ----------------*		sequential scan node* ---------------- */
typedef Scan SeqScan;

执行器

输入执行计划，输出最终结果。执行顺序扫描。

主流程

PortalStart
--> ExecutorStart--> InitPlan--> ExecInitSeqScan
PortalRun
--> ExecutorRun--> ExecutePlan--> ExecSeqScan --> ExecScan--> ExecScanFetch--> SeqNext --> table_beginscan  // 表扫描函数scan_begin--> heap_beginscan  // 堆表beginscan--> initscan--> table_scan_getnextslot--> heap_getnextslot--> heapgettup_pagemode--> heapgetpage--> ReadBuffer_common--> ExecStoreBufferHeapTuple--> tts_buffer_heap_store_tuple
PortalDrop

具体实现代码如下：

/* ----------------------------------------------------------------*		ExecSeqScan(node)**		Scans the relation sequentially and returns the next qualifying*		tuple.*		We call the ExecScan() routine and pass it the appropriate*		access method functions.* ---------------------------------------------------------------- */
static TupleTableSlot *
ExecSeqScan(PlanState *pstate)
{SeqScanState *node = castNode(SeqScanState, pstate);return ExecScan(&node->ss,(ExecScanAccessMtd) SeqNext,(ExecScanRecheckMtd) SeqRecheck);
}/** ExecScanFetch -- check interrupts & fetch next potential tuple** This routine is concerned with substituting a test tuple if we are* inside an EvalPlanQual recheck.  If we aren't, just execute* the access method's next-tuple routine.*/
static inline TupleTableSlot *ExecScanFetch(ScanState *node, ExecScanAccessMtd accessMtd, ExecScanRecheckMtd recheckMtd)
{EState	   *estate = node->ps.state;if (estate->es_epq_active != NULL){// ...}/* Run the node-type-specific access method function to get the next tuple */return (*accessMtd) (node);
}/* ----------------------------------------------------------------*		SeqNext**		This is a workhorse for ExecSeqScan* ----------------------------------------------------------------*/
static TupleTableSlot *SeqNext(SeqScanState *node)
{TableScanDesc scandesc;EState	   *estate;ScanDirection direction;TupleTableSlot *slot;/* get information from the estate and scan state */scandesc = node->ss.ss_currentScanDesc;estate = node->ss.ps.state;direction = estate->es_direction;slot = node->ss.ss_ScanTupleSlot;if (scandesc == NULL){/* We reach here if the scan is not parallel, or if we're serially* executing a scan that was planned to be parallel. */scandesc = table_beginscan(node->ss.ss_currentRelation, estate->es_snapshot, 0, NULL);node->ss.ss_currentScanDesc = scandesc;}/* get the next tuple from the table */if (table_scan_getnextslot(scandesc, direction, slot))return slot;return NULL;
}/** Return next tuple from `scan`, store in slot.*/
static inline bool
table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);	// heap_getnextslot
}

堆表扫描

我们知道在PG中，表的存储是被切成很多页（也可以叫区块）的，默认是8k的大小（引申问题：为什么是8k呢？这个页的大小涉及到性能，涉及到很多方面，比如VM，FSM，页缓存，磁盘读取等）。全表扫描就是从表的第0块开始，一直扫描到最后一块。扫描的时候，先指定要扫描的块号，查找块是否在缓冲区中，如果不在缓冲区中，则从磁盘读取该表指定的块到缓存区中。块扫描完成后，加1，继续扫描下一个块，直到扫描结束。如果表很大，缓冲区不能完全缓冲，则会依据时钟扫描算法，选择淘汰掉某些块。

/** Descriptor for heap table scans.*/
typedef struct HeapScanDescData
{TableScanDescData rs_base;	/* AM independent part of the descriptor *//* state set up at initscan time */BlockNumber rs_nblocks;		/* total number of blocks in rel */BlockNumber rs_startblock;	/* block # to start at */BlockNumber rs_numblocks;	/* max number of blocks to scan *//* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" *//* scan current state */bool		rs_inited;		/* false = scan not init'd yet */BlockNumber rs_cblock;		/* current block # in scan, if any */Buffer		rs_cbuf;		/* current buffer in scan, if any *//* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer *//* rs_numblocks is usually InvalidBlockNumber, meaning "scan whole rel" */BufferAccessStrategy rs_strategy;	/* access strategy for reads */HeapTupleData rs_ctup;		/* current tuple in scan, if any *//* these fields only used in page-at-a-time mode and for bitmap scans */int			rs_cindex;		/* current tuple's index in vistuples */int			rs_ntuples;		/* number of visible tuples on page */OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];	/* their offsets */
}			HeapScanDescData;
typedef struct HeapScanDescData *HeapScanDesc;

更详细的代码要阅读heapam.c中的实现。

/** IDENTIFICATION*	  src/backend/access/heap/heapam.c** INTERFACE ROUTINES*		heap_beginscan	- begin relation scan*		heap_rescan		- restart a relation scan*		heap_endscan	- end relation scan*		heap_getnext	- retrieve next tuple in scan*		heap_fetch		- retrieve tuple with given tid*		heap_insert		- insert tuple into a relation*		heap_multi_insert - insert multiple tuples into a relation*		heap_delete		- delete a tuple from a relation*		heap_update		- replace a tuple in a relation with another tuple*/

bool
heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{HeapScanDesc scan = (HeapScanDesc) sscan;/* Note: no locking manipulations needed */if (sscan->rs_flags & SO_ALLOW_PAGEMODE)heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);elseheapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);if (scan->rs_ctup.t_data == NULL){ExecClearTuple(slot);return false;}/** if we get here it means we have a new current scan tuple, so point to* the proper return buffer and return the tuple.*/pgstat_count_heap_getnext(scan->rs_base.rs_rd);ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,scan->rs_cbuf);return true;
}