db.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179
  1. // Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
  2. // All rights reserved.
  3. //
  4. // Use of this source code is governed by a BSD-style license that can be
  5. // found in the LICENSE file.
  6. package leveldb
  7. import (
  8. "container/list"
  9. "fmt"
  10. "io"
  11. "os"
  12. "runtime"
  13. "strings"
  14. "sync"
  15. "sync/atomic"
  16. "time"
  17. "github.com/syndtr/goleveldb/leveldb/errors"
  18. "github.com/syndtr/goleveldb/leveldb/iterator"
  19. "github.com/syndtr/goleveldb/leveldb/journal"
  20. "github.com/syndtr/goleveldb/leveldb/memdb"
  21. "github.com/syndtr/goleveldb/leveldb/opt"
  22. "github.com/syndtr/goleveldb/leveldb/storage"
  23. "github.com/syndtr/goleveldb/leveldb/table"
  24. "github.com/syndtr/goleveldb/leveldb/util"
  25. )
// DB is a LevelDB database.
//
// Instances are created by Open, OpenFile, Recover or RecoverFile. The field
// order at the top of the struct is deliberate: the 64-bit fields come first
// so that they stay 64-bit aligned.
type DB struct {
	// Need 64-bit alignment.
	seq uint64

	// Stats. Need 64-bit alignment.
	cWriteDelay            int64 // The cumulative duration of write delays
	cWriteDelayN           int32 // The cumulative number of write delays
	inWritePaused          int32 // The indicator whether write operation is paused by compaction (1 means paused)
	aliveSnaps, aliveIters int32 // Reported by the "leveldb.alivesnaps" and "leveldb.aliveiters" properties

	// Session.
	s *session

	// MemDB.
	memMu           sync.RWMutex
	memPool         chan *memdb.DB
	mem, frozenMem  *memDB
	journal         *journal.Writer
	journalWriter   storage.Writer
	journalFd       storage.FileDesc
	frozenJournalFd storage.FileDesc
	frozenSeq       uint64

	// Snapshot.
	snapsMu   sync.Mutex
	snapsList *list.List

	// Write.
	batchPool    sync.Pool
	writeMergeC  chan writeMerge
	writeMergedC chan bool
	writeLockC   chan struct{}
	writeAckC    chan error
	writeDelay   time.Duration
	writeDelayN  int
	tr           *Transaction

	// Compaction.
	compCommitLk     sync.Mutex
	tcompCmdC        chan cCmd
	tcompPauseC      chan chan<- struct{}
	mcompCmdC        chan cCmd
	compErrC         chan error
	compPerErrC      chan error
	compErrSetC      chan error
	compWriteLocking bool
	compStats        cStats
	memdbMaxLevel    int // For testing.

	// Close.
	closeW sync.WaitGroup
	closeC chan struct{}
	closed uint32
	// closer is set by OpenFile and RecoverFile to the storage they opened.
	// NOTE(review): presumably released by Close — confirm against Close.
	closer io.Closer
}
  75. func openDB(s *session) (*DB, error) {
  76. s.log("db@open opening")
  77. start := time.Now()
  78. db := &DB{
  79. s: s,
  80. // Initial sequence
  81. seq: s.stSeqNum,
  82. // MemDB
  83. memPool: make(chan *memdb.DB, 1),
  84. // Snapshot
  85. snapsList: list.New(),
  86. // Write
  87. batchPool: sync.Pool{New: newBatch},
  88. writeMergeC: make(chan writeMerge),
  89. writeMergedC: make(chan bool),
  90. writeLockC: make(chan struct{}, 1),
  91. writeAckC: make(chan error),
  92. // Compaction
  93. tcompCmdC: make(chan cCmd),
  94. tcompPauseC: make(chan chan<- struct{}),
  95. mcompCmdC: make(chan cCmd),
  96. compErrC: make(chan error),
  97. compPerErrC: make(chan error),
  98. compErrSetC: make(chan error),
  99. // Close
  100. closeC: make(chan struct{}),
  101. }
  102. // Read-only mode.
  103. readOnly := s.o.GetReadOnly()
  104. if readOnly {
  105. // Recover journals (read-only mode).
  106. if err := db.recoverJournalRO(); err != nil {
  107. return nil, err
  108. }
  109. } else {
  110. // Recover journals.
  111. if err := db.recoverJournal(); err != nil {
  112. return nil, err
  113. }
  114. // Remove any obsolete files.
  115. if err := db.checkAndCleanFiles(); err != nil {
  116. // Close journal.
  117. if db.journal != nil {
  118. db.journal.Close()
  119. db.journalWriter.Close()
  120. }
  121. return nil, err
  122. }
  123. }
  124. // Doesn't need to be included in the wait group.
  125. go db.compactionError()
  126. go db.mpoolDrain()
  127. if readOnly {
  128. db.SetReadOnly()
  129. } else {
  130. db.closeW.Add(2)
  131. go db.tCompaction()
  132. go db.mCompaction()
  133. // go db.jWriter()
  134. }
  135. s.logf("db@open done T·%v", time.Since(start))
  136. runtime.SetFinalizer(db, (*DB).Close)
  137. return db, nil
  138. }
  139. // Open opens or creates a DB for the given storage.
  140. // The DB will be created if not exist, unless ErrorIfMissing is true.
  141. // Also, if ErrorIfExist is true and the DB exist Open will returns
  142. // os.ErrExist error.
  143. //
  144. // Open will return an error with type of ErrCorrupted if corruption
  145. // detected in the DB. Use errors.IsCorrupted to test whether an error is
  146. // due to corruption. Corrupted DB can be recovered with Recover function.
  147. //
  148. // The returned DB instance is safe for concurrent use.
  149. // The DB must be closed after use, by calling Close method.
  150. func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) {
  151. s, err := newSession(stor, o)
  152. if err != nil {
  153. return
  154. }
  155. defer func() {
  156. if err != nil {
  157. s.close()
  158. s.release()
  159. }
  160. }()
  161. err = s.recover()
  162. if err != nil {
  163. if !os.IsNotExist(err) || s.o.GetErrorIfMissing() || s.o.GetReadOnly() {
  164. return
  165. }
  166. err = s.create()
  167. if err != nil {
  168. return
  169. }
  170. } else if s.o.GetErrorIfExist() {
  171. err = os.ErrExist
  172. return
  173. }
  174. return openDB(s)
  175. }
  176. // OpenFile opens or creates a DB for the given path.
  177. // The DB will be created if not exist, unless ErrorIfMissing is true.
  178. // Also, if ErrorIfExist is true and the DB exist OpenFile will returns
  179. // os.ErrExist error.
  180. //
  181. // OpenFile uses standard file-system backed storage implementation as
  182. // described in the leveldb/storage package.
  183. //
  184. // OpenFile will return an error with type of ErrCorrupted if corruption
  185. // detected in the DB. Use errors.IsCorrupted to test whether an error is
  186. // due to corruption. Corrupted DB can be recovered with Recover function.
  187. //
  188. // The returned DB instance is safe for concurrent use.
  189. // The DB must be closed after use, by calling Close method.
  190. func OpenFile(path string, o *opt.Options) (db *DB, err error) {
  191. stor, err := storage.OpenFile(path, o.GetReadOnly())
  192. if err != nil {
  193. return
  194. }
  195. db, err = Open(stor, o)
  196. if err != nil {
  197. stor.Close()
  198. } else {
  199. db.closer = stor
  200. }
  201. return
  202. }
  203. // Recover recovers and opens a DB with missing or corrupted manifest files
  204. // for the given storage. It will ignore any manifest files, valid or not.
  205. // The DB must already exist or it will returns an error.
  206. // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options.
  207. //
  208. // The returned DB instance is safe for concurrent use.
  209. // The DB must be closed after use, by calling Close method.
  210. func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) {
  211. s, err := newSession(stor, o)
  212. if err != nil {
  213. return
  214. }
  215. defer func() {
  216. if err != nil {
  217. s.close()
  218. s.release()
  219. }
  220. }()
  221. err = recoverTable(s, o)
  222. if err != nil {
  223. return
  224. }
  225. return openDB(s)
  226. }
  227. // RecoverFile recovers and opens a DB with missing or corrupted manifest files
  228. // for the given path. It will ignore any manifest files, valid or not.
  229. // The DB must already exist or it will returns an error.
  230. // Also, Recover will ignore ErrorIfMissing and ErrorIfExist options.
  231. //
  232. // RecoverFile uses standard file-system backed storage implementation as described
  233. // in the leveldb/storage package.
  234. //
  235. // The returned DB instance is safe for concurrent use.
  236. // The DB must be closed after use, by calling Close method.
  237. func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
  238. stor, err := storage.OpenFile(path, false)
  239. if err != nil {
  240. return
  241. }
  242. db, err = Recover(stor, o)
  243. if err != nil {
  244. stor.Close()
  245. } else {
  246. db.closer = stor
  247. }
  248. return
  249. }
// recoverTable rebuilds the manifest of session s from the table files found
// in its storage. Every table is scanned; tables without any usable key are
// left out, tables with some corruption are either left out (strict recovery)
// or rewritten with only their valid entries, and every surviving table is
// registered at level 0. Finally a new manifest is created and the collected
// session record is committed.
//
// It works on a private copy of o with StrictReader masked out, and returns
// the first storage, table or commit error encountered.
func recoverTable(s *session, o *opt.Options) error {
	o = dupOptions(o)
	// Mask StrictReader, lets StrictRecovery doing its job.
	o.Strict &= ^opt.StrictReader

	// Get all tables and sort it by file number.
	fds, err := s.stor.List(storage.TypeTable)
	if err != nil {
		return err
	}
	sortFds(fds)

	var (
		// Totals accumulated over all tables by the recoverTable closure below.
		maxSeq                                                            uint64
		recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int

		// We will drop corrupted table.
		strict = o.GetStrict(opt.StrictRecovery)
		noSync = o.GetNoSync()

		rec = &sessionRecord{}
		// NOTE(review): the +5 presumably accounts for a per-block trailer —
		// confirm against the table package.
		bpool = util.NewBufferPool(o.GetBlockSize() + 5)
	)

	// buildTable copies every entry of iter that has a valid internal key
	// into a new temporary table file and returns that file and its size.
	// On failure the temporary file is removed and a zero FileDesc returned.
	buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) {
		tmpFd = s.newTemp()
		writer, err := s.stor.Create(tmpFd)
		if err != nil {
			return
		}
		defer func() {
			writer.Close()
			// err is the named result, so this sees the final outcome.
			if err != nil {
				s.stor.Remove(tmpFd)
				tmpFd = storage.FileDesc{}
			}
		}()

		// Copy entries.
		tw := table.NewWriter(writer, o)
		for iter.Next() {
			key := iter.Key()
			if validInternalKey(key) {
				err = tw.Append(key, iter.Value())
				if err != nil {
					return
				}
			}
		}
		// Corruption errors are tolerated here: whatever could be read has
		// already been copied.
		err = iter.Error()
		if err != nil && !errors.IsCorrupted(err) {
			return
		}
		err = tw.Close()
		if err != nil {
			return
		}
		if !noSync {
			err = writer.Sync()
			if err != nil {
				return
			}
		}
		size = int64(tw.BytesLen())
		return
	}

	// recoverTable scans a single table file, updates the totals above and,
	// when the table is kept, adds it to rec at level 0.
	recoverTable := func(fd storage.FileDesc) error {
		s.logf("table@recovery recovering @%d", fd.Num)
		reader, err := s.stor.Open(fd)
		if err != nil {
			return err
		}
		// closed is set once the reader has been closed explicitly (before
		// the rename below), so that it is not closed a second time here.
		var closed bool
		defer func() {
			if !closed {
				reader.Close()
			}
		}()

		// Get file size (whence 2 seeks relative to the end of the file).
		size, err := reader.Seek(0, 2)
		if err != nil {
			return err
		}

		var (
			// Per-table counterparts of the totals.
			tSeq                                     uint64
			tgoodKey, tcorruptedKey, tcorruptedBlock int
			// Copies of the first and last valid internal key seen.
			imin, imax []byte
		)
		tr, err := table.NewReader(reader, size, fd, nil, bpool, o)
		if err != nil {
			return err
		}
		iter := tr.NewIterator(nil, nil)
		// Count corrupted blocks through the iterator's error callback, when
		// the iterator supports one.
		if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok {
			itererr.SetErrorCallback(func(err error) {
				if errors.IsCorrupted(err) {
					s.logf("table@recovery block corruption @%d %q", fd.Num, err)
					tcorruptedBlock++
				}
			})
		}

		// Scan the table.
		for iter.Next() {
			key := iter.Key()
			_, seq, _, kerr := parseInternalKey(key)
			if kerr != nil {
				tcorruptedKey++
				continue
			}
			tgoodKey++
			if seq > tSeq {
				tSeq = seq
			}
			if imin == nil {
				imin = append([]byte{}, key...)
			}
			// Reuse imax's backing array; key is copied, not retained.
			imax = append(imax[:0], key...)
		}
		// Only non-corruption errors abort the recovery.
		if err := iter.Error(); err != nil && !errors.IsCorrupted(err) {
			iter.Release()
			return err
		}
		iter.Release()

		goodKey += tgoodKey
		corruptedKey += tcorruptedKey
		corruptedBlock += tcorruptedBlock

		// In strict mode any corruption drops the whole table.
		if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
			droppedTable++
			s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
			return nil
		}

		if tgoodKey > 0 {
			if tcorruptedKey > 0 || tcorruptedBlock > 0 {
				// Rebuild the table.
				s.logf("table@recovery rebuilding @%d", fd.Num)
				iter := tr.NewIterator(nil, nil)
				tmpFd, newSize, err := buildTable(iter)
				iter.Release()
				if err != nil {
					return err
				}
				// Close the original before the rebuilt file is renamed
				// over it.
				closed = true
				reader.Close()
				if err := s.stor.Rename(tmpFd, fd); err != nil {
					return err
				}
				size = newSize
			}
			if tSeq > maxSeq {
				maxSeq = tSeq
			}
			recoveredKey += tgoodKey
			// Add table to level 0.
			rec.addTable(0, fd.Num, size, imin, imax)
			s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
		} else {
			// Nothing usable in this table; it is not added to the record.
			droppedTable++
			s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size)
		}

		return nil
	}

	// Recover all tables.
	if len(fds) > 0 {
		s.logf("table@recovery F·%d", len(fds))

		// Mark file number as used (fds is sorted, so the last one is the
		// highest).
		s.markFileNum(fds[len(fds)-1].Num)

		for _, fd := range fds {
			if err := recoverTable(fd); err != nil {
				return err
			}
		}

		s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq)
	}

	// Set sequence number.
	rec.setSeqNum(maxSeq)

	// Create new manifest.
	if err := s.create(); err != nil {
		return err
	}

	// Commit.
	return s.commit(rec)
}
// recoverJournal replays the journal files that are still relevant to the
// session into table files, then creates a fresh journal and memdb and
// commits the resulting session record.
//
// Journals are processed in file-number order. The content of each journal is
// decoded into a scratch memdb, which is flushed to a table whenever it grows
// to the write buffer size and once more before the next journal is started.
// A journal file is removed only after the record that makes it obsolete has
// been committed.
func (db *DB) recoverJournal() error {
	// Get all journals and sort it by file number.
	rawFds, err := db.s.stor.List(storage.TypeJournal)
	if err != nil {
		return err
	}
	sortFds(rawFds)

	// Journals that will be recovered: the current one and anything newer,
	// plus the previous journal recorded by the session.
	var fds []storage.FileDesc
	for _, fd := range rawFds {
		if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
			fds = append(fds, fd)
		}
	}

	var (
		ofd storage.FileDesc // Obsolete file.
		rec = &sessionRecord{}
	)

	// Recover journals.
	if len(fds) > 0 {
		db.logf("journal@recovery F·%d", len(fds))

		// Mark file number as used.
		db.s.markFileNum(fds[len(fds)-1].Num)

		var (
			// Options.
			strict      = db.s.o.GetStrict(opt.StrictJournal)
			checksum    = db.s.o.GetStrict(opt.StrictJournalChecksum)
			writeBuffer = db.s.o.GetWriteBuffer()

			// A single reader, memdb and buffer are reused for all journals.
			jr       *journal.Reader
			mdb      = memdb.New(db.s.icmp, writeBuffer)
			buf      = &util.Buffer{}
			batchSeq uint64
			batchLen int
		)

		for _, fd := range fds {
			db.logf("journal@recovery recovering @%d", fd.Num)

			fr, err := db.s.stor.Open(fd)
			if err != nil {
				return err
			}

			// Create or reset journal reader instance.
			if jr == nil {
				jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
			} else {
				jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
			}

			// Flush memdb and remove obsolete journal file. ofd is the
			// journal replayed in the previous iteration.
			if !ofd.Zero() {
				if mdb.Len() > 0 {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}
				}

				rec.setJournalNum(fd.Num)
				rec.setSeqNum(db.seq)
				if err := db.s.commit(rec); err != nil {
					fr.Close()
					return err
				}
				rec.resetAddedTables()

				// Removed only after the commit above succeeded.
				db.s.stor.Remove(ofd)
				ofd = storage.FileDesc{}
			}

			// Replay journal to memdb.
			mdb.Reset()
			for {
				r, err := jr.Next()
				if err != nil {
					if err == io.EOF {
						break
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				buf.Reset()
				if _, err := buf.ReadFrom(r); err != nil {
					if err == io.ErrUnexpectedEOF {
						// This is error returned due to corruption, with strict == false.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}
				batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb)
				if err != nil {
					if !strict && errors.IsCorrupted(err) {
						db.s.logf("journal error: %v (skipped)", err)
						// We won't apply sequence number as it might be corrupted.
						continue
					}

					fr.Close()
					return errors.SetFd(err, fd)
				}

				// Save sequence number.
				db.seq = batchSeq + uint64(batchLen)

				// Flush it if large enough.
				if mdb.Size() >= writeBuffer {
					if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
						fr.Close()
						return err
					}

					mdb.Reset()
				}
			}

			fr.Close()
			// This journal becomes removable once its content is committed.
			ofd = fd
		}

		// Flush the last memdb.
		if mdb.Len() > 0 {
			if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil {
				return err
			}
		}
	}

	// Create a new journal.
	if _, err := db.newMem(0); err != nil {
		return err
	}

	// Commit.
	rec.setJournalNum(db.journalFd.Num)
	rec.setSeqNum(db.seq)
	if err := db.s.commit(rec); err != nil {
		// Close journal on error.
		if db.journal != nil {
			db.journal.Close()
			db.journalWriter.Close()
		}
		return err
	}

	// Remove the last obsolete journal file.
	if !ofd.Zero() {
		db.s.stor.Remove(ofd)
	}

	return nil
}
  562. func (db *DB) recoverJournalRO() error {
  563. // Get all journals and sort it by file number.
  564. rawFds, err := db.s.stor.List(storage.TypeJournal)
  565. if err != nil {
  566. return err
  567. }
  568. sortFds(rawFds)
  569. // Journals that will be recovered.
  570. var fds []storage.FileDesc
  571. for _, fd := range rawFds {
  572. if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum {
  573. fds = append(fds, fd)
  574. }
  575. }
  576. var (
  577. // Options.
  578. strict = db.s.o.GetStrict(opt.StrictJournal)
  579. checksum = db.s.o.GetStrict(opt.StrictJournalChecksum)
  580. writeBuffer = db.s.o.GetWriteBuffer()
  581. mdb = memdb.New(db.s.icmp, writeBuffer)
  582. )
  583. // Recover journals.
  584. if len(fds) > 0 {
  585. db.logf("journal@recovery RO·Mode F·%d", len(fds))
  586. var (
  587. jr *journal.Reader
  588. buf = &util.Buffer{}
  589. batchSeq uint64
  590. batchLen int
  591. )
  592. for _, fd := range fds {
  593. db.logf("journal@recovery recovering @%d", fd.Num)
  594. fr, err := db.s.stor.Open(fd)
  595. if err != nil {
  596. return err
  597. }
  598. // Create or reset journal reader instance.
  599. if jr == nil {
  600. jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum)
  601. } else {
  602. jr.Reset(fr, dropper{db.s, fd}, strict, checksum)
  603. }
  604. // Replay journal to memdb.
  605. for {
  606. r, err := jr.Next()
  607. if err != nil {
  608. if err == io.EOF {
  609. break
  610. }
  611. fr.Close()
  612. return errors.SetFd(err, fd)
  613. }
  614. buf.Reset()
  615. if _, err := buf.ReadFrom(r); err != nil {
  616. if err == io.ErrUnexpectedEOF {
  617. // This is error returned due to corruption, with strict == false.
  618. continue
  619. }
  620. fr.Close()
  621. return errors.SetFd(err, fd)
  622. }
  623. batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb)
  624. if err != nil {
  625. if !strict && errors.IsCorrupted(err) {
  626. db.s.logf("journal error: %v (skipped)", err)
  627. // We won't apply sequence number as it might be corrupted.
  628. continue
  629. }
  630. fr.Close()
  631. return errors.SetFd(err, fd)
  632. }
  633. // Save sequence number.
  634. db.seq = batchSeq + uint64(batchLen)
  635. }
  636. fr.Close()
  637. }
  638. }
  639. // Set memDB.
  640. db.mem = &memDB{db: db, DB: mdb, ref: 1}
  641. return nil
  642. }
  643. func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) {
  644. mk, mv, err := mdb.Find(ikey)
  645. if err == nil {
  646. ukey, _, kt, kerr := parseInternalKey(mk)
  647. if kerr != nil {
  648. // Shouldn't have had happen.
  649. panic(kerr)
  650. }
  651. if icmp.uCompare(ukey, ikey.ukey()) == 0 {
  652. if kt == keyTypeDel {
  653. return true, nil, ErrNotFound
  654. }
  655. return true, mv, nil
  656. }
  657. } else if err != ErrNotFound {
  658. return true, nil, err
  659. }
  660. return
  661. }
  662. func (db *DB) get(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
  663. ikey := makeInternalKey(nil, key, seq, keyTypeSeek)
  664. if auxm != nil {
  665. if ok, mv, me := memGet(auxm, ikey, db.s.icmp); ok {
  666. return append([]byte{}, mv...), me
  667. }
  668. }
  669. em, fm := db.getMems()
  670. for _, m := range [...]*memDB{em, fm} {
  671. if m == nil {
  672. continue
  673. }
  674. defer m.decref()
  675. if ok, mv, me := memGet(m.DB, ikey, db.s.icmp); ok {
  676. return append([]byte{}, mv...), me
  677. }
  678. }
  679. v := db.s.version()
  680. value, cSched, err := v.get(auxt, ikey, ro, false)
  681. v.release()
  682. if cSched {
  683. // Trigger table compaction.
  684. db.compTrigger(db.tcompCmdC)
  685. }
  686. return
  687. }
  688. func nilIfNotFound(err error) error {
  689. if err == ErrNotFound {
  690. return nil
  691. }
  692. return err
  693. }
  694. func (db *DB) has(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) {
  695. ikey := makeInternalKey(nil, key, seq, keyTypeSeek)
  696. if auxm != nil {
  697. if ok, _, me := memGet(auxm, ikey, db.s.icmp); ok {
  698. return me == nil, nilIfNotFound(me)
  699. }
  700. }
  701. em, fm := db.getMems()
  702. for _, m := range [...]*memDB{em, fm} {
  703. if m == nil {
  704. continue
  705. }
  706. defer m.decref()
  707. if ok, _, me := memGet(m.DB, ikey, db.s.icmp); ok {
  708. return me == nil, nilIfNotFound(me)
  709. }
  710. }
  711. v := db.s.version()
  712. _, cSched, err := v.get(auxt, ikey, ro, true)
  713. v.release()
  714. if cSched {
  715. // Trigger table compaction.
  716. db.compTrigger(db.tcompCmdC)
  717. }
  718. if err == nil {
  719. ret = true
  720. } else if err == ErrNotFound {
  721. err = nil
  722. }
  723. return
  724. }
  725. // Get gets the value for the given key. It returns ErrNotFound if the
  726. // DB does not contains the key.
  727. //
  728. // The returned slice is its own copy, it is safe to modify the contents
  729. // of the returned slice.
  730. // It is safe to modify the contents of the argument after Get returns.
  731. func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) {
  732. err = db.ok()
  733. if err != nil {
  734. return
  735. }
  736. se := db.acquireSnapshot()
  737. defer db.releaseSnapshot(se)
  738. return db.get(nil, nil, key, se.seq, ro)
  739. }
  740. // Has returns true if the DB does contains the given key.
  741. //
  742. // It is safe to modify the contents of the argument after Has returns.
  743. func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) {
  744. err = db.ok()
  745. if err != nil {
  746. return
  747. }
  748. se := db.acquireSnapshot()
  749. defer db.releaseSnapshot(se)
  750. return db.has(nil, nil, key, se.seq, ro)
  751. }
  752. // NewIterator returns an iterator for the latest snapshot of the
  753. // underlying DB.
  754. // The returned iterator is not safe for concurrent use, but it is safe to use
  755. // multiple iterators concurrently, with each in a dedicated goroutine.
  756. // It is also safe to use an iterator concurrently with modifying its
  757. // underlying DB. The resultant key/value pairs are guaranteed to be
  758. // consistent.
  759. //
  760. // Slice allows slicing the iterator to only contains keys in the given
  761. // range. A nil Range.Start is treated as a key before all keys in the
  762. // DB. And a nil Range.Limit is treated as a key after all keys in
  763. // the DB.
  764. //
  765. // WARNING: Any slice returned by interator (e.g. slice returned by calling
  766. // Iterator.Key() or Iterator.Key() methods), its content should not be modified
  767. // unless noted otherwise.
  768. //
  769. // The iterator must be released after use, by calling Release method.
  770. //
  771. // Also read Iterator documentation of the leveldb/iterator package.
  772. func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
  773. if err := db.ok(); err != nil {
  774. return iterator.NewEmptyIterator(err)
  775. }
  776. se := db.acquireSnapshot()
  777. defer db.releaseSnapshot(se)
  778. // Iterator holds 'version' lock, 'version' is immutable so snapshot
  779. // can be released after iterator created.
  780. return db.newIterator(nil, nil, se.seq, slice, ro)
  781. }
  782. // GetSnapshot returns a latest snapshot of the underlying DB. A snapshot
  783. // is a frozen snapshot of a DB state at a particular point in time. The
  784. // content of snapshot are guaranteed to be consistent.
  785. //
  786. // The snapshot must be released after use, by calling Release method.
  787. func (db *DB) GetSnapshot() (*Snapshot, error) {
  788. if err := db.ok(); err != nil {
  789. return nil, err
  790. }
  791. return db.newSnapshot(), nil
  792. }
  793. // GetProperty returns value of the given property name.
  794. //
  795. // Property names:
  796. // leveldb.num-files-at-level{n}
  797. // Returns the number of files at level 'n'.
  798. // leveldb.stats
  799. // Returns statistics of the underlying DB.
  800. // leveldb.iostats
  801. // Returns statistics of effective disk read and write.
  802. // leveldb.writedelay
  803. // Returns cumulative write delay caused by compaction.
  804. // leveldb.sstables
  805. // Returns sstables list for each level.
  806. // leveldb.blockpool
  807. // Returns block pool stats.
  808. // leveldb.cachedblock
  809. // Returns size of cached block.
  810. // leveldb.openedtables
  811. // Returns number of opened tables.
  812. // leveldb.alivesnaps
  813. // Returns number of alive snapshots.
  814. // leveldb.aliveiters
  815. // Returns number of alive iterators.
  816. func (db *DB) GetProperty(name string) (value string, err error) {
  817. err = db.ok()
  818. if err != nil {
  819. return
  820. }
  821. const prefix = "leveldb."
  822. if !strings.HasPrefix(name, prefix) {
  823. return "", ErrNotFound
  824. }
  825. p := name[len(prefix):]
  826. v := db.s.version()
  827. defer v.release()
  828. numFilesPrefix := "num-files-at-level"
  829. switch {
  830. case strings.HasPrefix(p, numFilesPrefix):
  831. var level uint
  832. var rest string
  833. n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
  834. if n != 1 {
  835. err = ErrNotFound
  836. } else {
  837. value = fmt.Sprint(v.tLen(int(level)))
  838. }
  839. case p == "stats":
  840. value = "Compactions\n" +
  841. " Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" +
  842. "-------+------------+---------------+---------------+---------------+---------------\n"
  843. for level, tables := range v.levels {
  844. duration, read, write := db.compStats.getStat(level)
  845. if len(tables) == 0 && duration == 0 {
  846. continue
  847. }
  848. value += fmt.Sprintf(" %3d | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n",
  849. level, len(tables), float64(tables.size())/1048576.0, duration.Seconds(),
  850. float64(read)/1048576.0, float64(write)/1048576.0)
  851. }
  852. case p == "iostats":
  853. value = fmt.Sprintf("Read(MB):%.5f Write(MB):%.5f",
  854. float64(db.s.stor.reads())/1048576.0,
  855. float64(db.s.stor.writes())/1048576.0)
  856. case p == "writedelay":
  857. writeDelayN, writeDelay := atomic.LoadInt32(&db.cWriteDelayN), time.Duration(atomic.LoadInt64(&db.cWriteDelay))
  858. paused := atomic.LoadInt32(&db.inWritePaused) == 1
  859. value = fmt.Sprintf("DelayN:%d Delay:%s Paused:%t", writeDelayN, writeDelay, paused)
  860. case p == "sstables":
  861. for level, tables := range v.levels {
  862. value += fmt.Sprintf("--- level %d ---\n", level)
  863. for _, t := range tables {
  864. value += fmt.Sprintf("%d:%d[%q .. %q]\n", t.fd.Num, t.size, t.imin, t.imax)
  865. }
  866. }
  867. case p == "blockpool":
  868. value = fmt.Sprintf("%v", db.s.tops.bpool)
  869. case p == "cachedblock":
  870. if db.s.tops.bcache != nil {
  871. value = fmt.Sprintf("%d", db.s.tops.bcache.Size())
  872. } else {
  873. value = "<nil>"
  874. }
  875. case p == "openedtables":
  876. value = fmt.Sprintf("%d", db.s.tops.cache.Size())
  877. case p == "alivesnaps":
  878. value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveSnaps))
  879. case p == "aliveiters":
  880. value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveIters))
  881. default:
  882. err = ErrNotFound
  883. }
  884. return
  885. }
// DBStats is database statistics.
//
// A DBStats value is filled in by (*DB).Stats. The Level* slices are parallel:
// entry i of each slice describes the same level. Stats omits levels that have
// no tables and a zero compaction duration, so the slice index is not
// necessarily the level number.
type DBStats struct {
	// WriteDelayCount is the DB's write-delay counter (the value the
	// "writedelay" property reports as DelayN).
	WriteDelayCount int32
	// WriteDelayDuration is the DB's accumulated write-delay value, converted
	// to a time.Duration (reported as Delay by the "writedelay" property).
	WriteDelayDuration time.Duration
	// WritePaused is true when the DB's inWritePaused flag equals 1.
	WritePaused bool

	// AliveSnapshots is the DB's aliveSnaps counter.
	// NOTE(review): presumably the number of snapshots not yet released — confirm.
	AliveSnapshots int32
	// AliveIterators is the DB's aliveIters counter.
	// NOTE(review): presumably the number of iterators not yet released — confirm.
	AliveIterators int32

	// IOWrite is the storage write counter. The "iostats" property divides
	// this counter by 1048576 to report MB, so it is presumably in bytes.
	IOWrite uint64
	// IORead is the storage read counter; same unit as IOWrite.
	IORead uint64

	// BlockCacheSize is the Size() of the block cache, or 0 when the DB has
	// no block cache.
	BlockCacheSize int
	// OpenedTablesCount is the Size() of the table cache.
	OpenedTablesCount int

	// LevelSizes holds the total table size of each reported level (the
	// "stats" property divides the same value by 1048576 to show MB).
	LevelSizes []int64
	// LevelTablesCounts holds the number of tables in each reported level.
	LevelTablesCounts []int
	// LevelRead holds the compaction read statistic of each reported level.
	LevelRead []int64
	// LevelWrite holds the compaction write statistic of each reported level.
	LevelWrite []int64
	// LevelDurations holds the compaction duration statistic of each
	// reported level.
	LevelDurations []time.Duration
}
  903. // Stats populates s with database statistics.
  904. func (db *DB) Stats(s *DBStats) error {
  905. err := db.ok()
  906. if err != nil {
  907. return err
  908. }
  909. s.IORead = db.s.stor.reads()
  910. s.IOWrite = db.s.stor.writes()
  911. s.WriteDelayCount = atomic.LoadInt32(&db.cWriteDelayN)
  912. s.WriteDelayDuration = time.Duration(atomic.LoadInt64(&db.cWriteDelay))
  913. s.WritePaused = atomic.LoadInt32(&db.inWritePaused) == 1
  914. s.OpenedTablesCount = db.s.tops.cache.Size()
  915. if db.s.tops.bcache != nil {
  916. s.BlockCacheSize = db.s.tops.bcache.Size()
  917. } else {
  918. s.BlockCacheSize = 0
  919. }
  920. s.AliveIterators = atomic.LoadInt32(&db.aliveIters)
  921. s.AliveSnapshots = atomic.LoadInt32(&db.aliveSnaps)
  922. s.LevelDurations = s.LevelDurations[:0]
  923. s.LevelRead = s.LevelRead[:0]
  924. s.LevelWrite = s.LevelWrite[:0]
  925. s.LevelSizes = s.LevelSizes[:0]
  926. s.LevelTablesCounts = s.LevelTablesCounts[:0]
  927. v := db.s.version()
  928. defer v.release()
  929. for level, tables := range v.levels {
  930. duration, read, write := db.compStats.getStat(level)
  931. if len(tables) == 0 && duration == 0 {
  932. continue
  933. }
  934. s.LevelDurations = append(s.LevelDurations, duration)
  935. s.LevelRead = append(s.LevelRead, read)
  936. s.LevelWrite = append(s.LevelWrite, write)
  937. s.LevelSizes = append(s.LevelSizes, tables.size())
  938. s.LevelTablesCounts = append(s.LevelTablesCounts, len(tables))
  939. }
  940. return nil
  941. }
  942. // SizeOf calculates approximate sizes of the given key ranges.
  943. // The length of the returned sizes are equal with the length of the given
  944. // ranges. The returned sizes measure storage space usage, so if the user
  945. // data compresses by a factor of ten, the returned sizes will be one-tenth
  946. // the size of the corresponding user data size.
  947. // The results may not include the sizes of recently written data.
  948. func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
  949. if err := db.ok(); err != nil {
  950. return nil, err
  951. }
  952. v := db.s.version()
  953. defer v.release()
  954. sizes := make(Sizes, 0, len(ranges))
  955. for _, r := range ranges {
  956. imin := makeInternalKey(nil, r.Start, keyMaxSeq, keyTypeSeek)
  957. imax := makeInternalKey(nil, r.Limit, keyMaxSeq, keyTypeSeek)
  958. start, err := v.offsetOf(imin)
  959. if err != nil {
  960. return nil, err
  961. }
  962. limit, err := v.offsetOf(imax)
  963. if err != nil {
  964. return nil, err
  965. }
  966. var size int64
  967. if limit >= start {
  968. size = limit - start
  969. }
  970. sizes = append(sizes, size)
  971. }
  972. return sizes, nil
  973. }
// Close closes the DB. It also releases any outstanding snapshots, aborts
// any in-flight compaction and discards an open transaction.
//
// It is not safe to close a DB until all outstanding iterators are released.
// It is valid to call Close multiple times: when db.setClosed() reports false
// (NOTE(review): presumably because the DB was already closed — confirm),
// Close returns ErrClosed and does nothing else. Other methods should not be
// called after the DB has been closed.
//
// The returned error is the pending compaction error, if one was available
// (ErrReadOnly is treated as no error); otherwise it is the error returned by
// closing db.closer, if a closer is set.
func (db *DB) Close() error {
	if !db.setClosed() {
		return ErrClosed
	}

	start := time.Now()
	db.log("db@close closing")

	// Clear the finalizer.
	runtime.SetFinalizer(db, nil)

	// Get compaction error. The receive is non-blocking: when nothing is
	// pending on compErrC, err stays nil.
	var err error
	select {
	case err = <-db.compErrC:
		if err == ErrReadOnly {
			err = nil
		}
	default:
	}

	// Signal all goroutines.
	close(db.closeC)

	// Discard open transaction.
	if db.tr != nil {
		db.tr.Discard()
	}

	// Acquire writer lock.
	db.writeLockC <- struct{}{}

	// Wait for all goroutines to exit.
	db.closeW.Wait()

	// Close the journal. The errors returned by both Close calls are
	// discarded here.
	if db.journal != nil {
		db.journal.Close()
		db.journalWriter.Close()
		db.journal = nil
		db.journalWriter = nil
	}

	// Log the write-delay totals, but only if a delay was recorded.
	if db.writeDelayN > 0 {
		db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
	}

	// Close session.
	db.s.close()
	db.logf("db@close done T·%v", time.Since(start))
	db.s.release()

	if db.closer != nil {
		// The closer is always closed, but its error is reported only when
		// no earlier (compaction) error is being returned.
		if err1 := db.closer.Close(); err == nil {
			err = err1
		}
		db.closer = nil
	}

	// Clear memdbs.
	db.clearMems()

	return err
}