本节通过一个具体的例子来分析SQLite原子提交的实现(基于Version 3.3.6的代码)。首先创建表:CREATE TABLE episodes(id integer primary key, name text, cid int); 然后插入一条记录:INSERT INTO episodes(name, cid) VALUES('cat', 1); 该语句经过编译器处理后生成的虚拟机代码如下:

    在SQLite写数据库之前,它必须先从数据库中读取相关信息。比如,在插入新的数据时,SQLite会先从sqlite_master表中读取数据库模式(相当于数据字典),以便编译器对INSERT语句进行分析,确定数据插入的位置。 在进行读操作之前,必须先获取数据库的共享锁(shared lock),共享锁允许两个或更多的连接在同一时刻读取数据库。但是共享锁不允许其它连接对数据库进行写操作。 shared lock存在于操作系统磁盘缓存,而不是磁盘本身。文件锁的本质只是操作系统的内核数据结构,当操作系统崩溃或掉电时,这些内核数据也会随之消失。

    在对数据进行修改操作之前,先要获取数据库文件的Reserved Lock,Reserved Lock和shared lock的相似之处在于,它们都允许其它进程对数据库文件进行读操作。Reserved Lock和Shared Lock可以共存,但是只能是一个Reserved Lock和多个Shared Lock——多个Reserved Lock不能共存。所以,在同一时刻,只能进行一个写操作。 Reserved Lock意味着当前进程(连接)想修改数据库文件,但是还没开始修改操作,所以其它的进程可以读数据库,但不能写数据库。

    1. // OP_Transaction: VDBE opcode that begins a transaction on one database.
    2. // p1 is the database index: 0 is the main database, 1 is the file used for temporary tables.
    3. // A non-zero p2 starts a write transaction; p2 is handed to sqlite3BtreeBeginTrans() unchanged.
    4. case OP_Transaction: {
    5. // Index of the target database, taken from operand p1.
    6. int i = pOp->p1;
    7. // B-tree handle belonging to that database.
    8. Btree *pBt;
    9. assert( i>=0 && i<db->nDb );
    10. assert( (p->btreeMask & (1<<i))!=0 );
    11. // Fetch the B-tree for database i; when it is NULL the opcode body below is skipped.
    12. pBt = db->aDb[i].pBt;
    13. if( pBt ){
    14. // Start the transaction at the B-tree layer; per the original note this mainly locks the file and sets the B-tree transaction state.
    15. rc = sqlite3BtreeBeginTrans(pBt, pOp->p2);
    16. if( rc==SQLITE_BUSY ){
    17. p->pc = pc; // save the current program counter (presumably so this opcode can be retried later; confirm with the caller)
    18. p->rc = rc = SQLITE_BUSY;
    19. goto vdbe_return;
    20. }
    21. if( rc!=SQLITE_OK && rc!=SQLITE_READONLY /* && rc!=SQLITE_BUSY */ ){
    22. goto abort_due_to_error; // SQLITE_READONLY is tolerated here; every other error aborts
    23. }
    24. }
    25. break;
    26. }
    27. // Begin a transaction on B-tree handle p: a non-zero wrflag requests a write transaction, zero a read transaction.
    28. // wrflag>1 is forwarded to sqlite3pager_begin() as exFlag, i.e. it asks for an exclusive transaction. Returns SQLITE_OK, SQLITE_READONLY, SQLITE_BUSY, or the error from the lock/pager calls.
    29. int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
    30. BtShared *pBt = p->pBt;
    31. int rc = SQLITE_OK;
    32. btreeIntegrity(p);
    33. /* If the btree is already in a write-transaction, or it
    34. ** is already in a read-transaction and a read-transaction
    35. ** is requested, this is a no-op.
    36. */
    37. // Already in a write transaction, or in a read transaction while another read is requested: return SQLITE_OK.
    38. if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    39. return SQLITE_OK;
    40. }
    41. /* Write transactions are not possible on a read-only database */
    42. // A write transaction cannot be opened on a read-only database.
    43. if( pBt->readOnly && wrflag ){
    44. return SQLITE_READONLY;
    45. }
    46. /* If another database handle has already opened a write transaction
    47. ** on this shared-btree structure and a second write transaction is
    48. ** requested, return SQLITE_BUSY.
    49. */
    50. // The shared B-tree already has a write transaction, so a second writer gets SQLITE_BUSY.
    51. if( pBt->inTransaction==TRANS_WRITE && wrflag ){
    52. return SQLITE_BUSY;
    53. }
    54. do {
    55. // If page 1 of the B-tree has not been loaded yet (pPage1 is NULL),
    56. // load it now; per the original note lockBtree() also takes the read lock on the database.
    57. if( pBt->pPage1==0 ){
    58. // Take the read lock and read the page into memory.
    59. rc = lockBtree(pBt);
    60. }
    61. if( rc==SQLITE_OK && wrflag ){
    62. // Ask the pager for the RESERVED lock on the database file (exclusive when wrflag>1).
    63. rc = sqlite3pager_begin(pBt->pPage1->aData, wrflag>1);
    64. if( rc==SQLITE_OK ){
    65. rc = newDatabase(pBt);
    66. }
    67. }
    68. if( rc==SQLITE_OK ){
    69. if( wrflag ) pBt->inStmt = 0;
    70. }else{
    71. unlockBtreeIfUnused(pBt);
    72. }
    73. }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
    74. sqlite3InvokeBusyHandler(pBt->pBusyHandler) );
    75. if( rc==SQLITE_OK ){
    76. if( p->inTrans==TRANS_NONE ){
    77. pBt->nTransaction++; // first transaction on this handle: increment the shared B-tree's transaction count
    78. }
    79. p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); // record this handle's transaction state
    80. if( p->inTrans>pBt->inTransaction ){ // only raise the shared state; a new reader must not downgrade TRANS_WRITE
    81. pBt->inTransaction = p->inTrans;
    82. }
    83. }
    84. btreeIntegrity(p);
    85. return rc;
    86. }
    87. /*
    88. ** Acquire a write lock on the database. The lock is removed when any of the following happens:
    89. ** * sqlite3pager_commit() is called.
    90. ** * sqlite3pager_rollback() is called.
    91. ** * sqlite3pager_close() is called.
    92. ** * sqlite3pager_unref() is called on every outstanding page.
    93. ** pData points to an open page of the database. The page itself is not modified; it is only used
    94. ** to reach the owning Pager and to check that the pager is not in the unlocked state.
    95. ** The journal file is opened when journaling is enabled and this is not a temporary file.
    96. ** If the pager is not in the SHARED state (e.g. already reserved for writing) this routine does nothing. A non-zero exFlag also waits for EXCLUSIVE_LOCK.
    97. */
    98. int sqlite3pager_begin(void *pData, int exFlag){
    99. PgHdr *pPg = DATA_TO_PGHDR(pData);
    100. Pager *pPager = pPg->pPager;
    101. int rc = SQLITE_OK;
    102. assert( pPg->nRef>0 );
    103. assert( pPager->state!=PAGER_UNLOCK );
    104. // Only act when the pager currently holds just the shared lock.
    105. if( pPager->state==PAGER_SHARED ){
    106. assert( pPager->aInJournal==0 );
    107. if( MEMDB ){
    108. pPager->state = PAGER_EXCLUSIVE; // MEMDB case: no OS lock call, go straight to the exclusive state
    109. pPager->origDbSize = pPager->dbSize;
    110. }else{
    111. // Take RESERVED_LOCK on the database file.
    112. rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
    113. if( rc==SQLITE_OK ){
    114. // Record the new pager state.
    115. pPager->state = PAGER_RESERVED;
    116. if( exFlag ){
    117. rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
    118. }
    119. }
    120. if( rc!=SQLITE_OK ){
    121. return rc;
    122. }
    123. pPager->dirtyCache = 0;
    124. TRACE2("TRANSACTION %d\n", PAGERID(pPager));
    125. // Journaling is enabled and this is not a temporary file: open the journal now.
    126. if( pPager->useJournal && !pPager->tempFile ){
    127. // Open the journal for this pager; pager_open_journal() asserts state>=PAGER_RESERVED
    128. // and writes the journal header.
    129. rc = pager_open_journal(pPager);
    130. }
    131. }
    132. }
    133. return rc;
    134. }
    135. // Create and open the journal file for pPager and write its first header; the pager must be in the RESERVED or EXCLUSIVE state. Returns SQLITE_OK or an error code.
    136. static int pager_open_journal(Pager *pPager){
    137. int rc;
    138. assert( !MEMDB );
    139. assert( pPager->state>=PAGER_RESERVED );
    140. assert( pPager->journalOpen==0 );
    141. assert( pPager->useJournal );
    142. assert( pPager->aInJournal==0 );
    143. sqlite3pager_pagecount(pPager);
    144. // Bitmap of pages recorded in the journal, sized at dbSize/8+1 bytes.
    145. pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
    146. if( pPager->aInJournal==0 ){
    147. rc = SQLITE_NOMEM;
    148. goto failed_to_open_journal;
    149. }
    150. // Open the journal file.
    151. rc = sqlite3OsOpenExclusive(pPager->zJournal, &pPager->jfd,
    152. pPager->tempFile);
    153. // Reset the journal write position and related bookkeeping before checking rc.
    154. pPager->journalOff = 0;
    155. pPager->setMaster = 0;
    156. pPager->journalHdr = 0;
    157. if( rc!=SQLITE_OK ){
    158. goto failed_to_open_journal;
    159. }
    160. /* Per the original annotation: the file the OS has just created generally lives only in the
    161. ** disk cache and not yet on disk; the operations below are meant to get the result onto disk, and
    162. ** Windows offers no corresponding API, so they have no effect there. NOTE(review): the annotation says "three operations" but only two calls follow - confirm against upstream pager.c.
    163. */
    164. // Per the original annotation, the full-sync setting has no effect on Windows.
    165. sqlite3OsSetFullSync(pPager->jfd, pPager->full_fsync);
    166. sqlite3OsSetFullSync(pPager->fd, pPager->full_fsync);
    167. /* Attempt to open a file descriptor for the directory that contains a file.
    168. **This file descriptor can be used to fsync() the directory. NOTE(review): no directory-open call appears in this listing; it seems to have been omitted.
    169. */
    170. pPager->journalOpen = 1;
    171. pPager->journalStarted = 0;
    172. pPager->needSync = 0;
    173. pPager->alwaysRollback = 0;
    174. pPager->nRec = 0;
    175. if( pPager->errCode ){
    176. rc = pPager->errCode;
    177. goto failed_to_open_journal;
    178. }
    179. pPager->origDbSize = pPager->dbSize;
    180. // Write the journal header (24 bytes according to the original annotation).
    181. rc = writeJournalHdr(pPager);
    182. if( pPager->stmtAutoopen && rc==SQLITE_OK ){
    183. rc = sqlite3pager_stmt_begin(pPager);
    184. }
    185. if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){ // failure other than out-of-memory: call pager_unwritelock()
    186. rc = pager_unwritelock(pPager);
    187. if( rc==SQLITE_OK ){
    188. rc = SQLITE_FULL; // pager_unwritelock() succeeded, so report the original failure as SQLITE_FULL
    189. }
    190. }
    191. return rc;
    192. failed_to_open_journal:
    193. sqliteFree(pPager->aInJournal);
    194. pPager->aInJournal = 0;
    195. if( rc==SQLITE_NOMEM ){
    196. /* If this was a malloc() failure, then we will not be closing the pager
    197. ** file. So delete any journal file we may have just created. Otherwise,
    198. ** the system will get confused, we have a read-lock on the file and a
    199. ** mysterious journal has appeared in the filesystem.
    200. */
    201. sqlite3OsDelete(pPager->zJournal);
    202. }else{
    203. sqlite3OsUnlock(pPager->fd, NO_LOCK); // any other failure: drop the file lock and mark the pager unlocked
    204. pPager->state = PAGER_UNLOCK;
    205. }
    206. return rc;
    207. }
    208. /*写入日志文件头
    209. **journal header的格式如下:
    210. ** - 8 bytes: 标志日志文件的魔数
    211. ** - 4 bytes: 日志文件中记录数
    212. ** - 4 bytes: Random number used for page hash.
    213. ** - 4 bytes: 原来数据库的大小(kb)
    214. ** - 4 bytes: 扇区大小512byte
    215. */
    216. static int writeJournalHdr(Pager *pPager){
    217. //日志文件头
    218. char zHeader[sizeof(aJournalMagic)+16];
    219. int rc = seekJournalHdr(pPager);
    220. if( rc ) return rc;
    221. pPager->journalHdr = pPager->journalOff;
    222. if( pPager->stmtHdrOff==0 ){
    223. pPager->stmtHdrOff = pPager->journalHdr;
    224. }
    225. //设置文件指针指向header之后
    226. pPager->journalOff += JOURNAL_HDR_SZ(pPager);
    227. /* FIX ME:
    228. **
    229. ** Possibly for a pager not in no-sync mode, the journal magic should not
    230. ** be written until nRec is filled in as part of next syncJournal().
    231. **
    232. ** Actually maybe the whole journal header should be delayed until that
    233. ** point. Think about this.
    234. */
    235. memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));
    236. /* The nRec Field. 0xFFFFFFFF for no-sync journals. */
    237. put32bits(&zHeader[sizeof(aJournalMagic)], pPager->noSync ? 0xffffffff : 0);
    238. /* The random check-hash initialiser */
    239. sqlite3Randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
    240. put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);
    241. /* The initial database size */
    242. put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbSize);
    243. /* The assumed sector size for this process */
    244. put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);
    245. //写入文件头
    246. rc = sqlite3OsWrite(pPager->jfd, zHeader, sizeof(zHeader));
    247. /* The journal header has been written successfully. Seek the journal
    248. ** file descriptor to the end of the journal header sector.
    249. */
    250. if( rc==SQLITE_OK ){
    251. rc = sqlite3OsSeek(pPager->jfd, pPager->journalOff-1);
    252. if( rc==SQLITE_OK ){
    253. rc = sqlite3OsWrite(pPager->jfd, "\000", 1);
    254. }
    255. }
    256. return rc;

    其实现过程如下图所示: