// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <stddef.h>
#include <stdint.h>

#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "rocksdb/advanced_options.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_checksum.h"
#include "rocksdb/listener.h"
#include "rocksdb/universal_compaction.h"
#include "rocksdb/version.h"
#include "rocksdb/write_buffer_manager.h"

#ifdef max
#undef max
#endif

namespace ROCKSDB_NAMESPACE {

class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class Comparator;
class ConcurrentTaskLimiter;
class Env;
enum InfoLogLevel : unsigned char;
class SstFileManager;
class FilterPolicy;
class Logger;
class MergeOperator;
class Snapshot;
class MemTableRepFactory;
class RateLimiter;
class Slice;
class Statistics;
class InternalKeyComparator;
class WalFilter;
class FileSystem;
enum class CpuPriority {
  kIdle = 0,
  kLow = 1,
  kNormal = 2,
  kHigh = 3,
};

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs.  Each block may be compressed before
// being stored in a file.  The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType : unsigned char {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression = 0x2,
  kBZip2Compression = 0x3,
  kLZ4Compression = 0x4,
  kLZ4HCCompression = 0x5,
  kXpressCompression = 0x6,
  kZSTD = 0x7,

  // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than
  // 0.8.0 or consider a possibility of downgrading the service or copying
  // the database files to another service running with an older version of
  // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will
  // eventually remove the option from the public API.
  kZSTDNotFinalCompression = 0x40,

  // kDisableCompressionOption is used to disable some compression options.
  kDisableCompressionOption = 0xff,
};

struct Options;
struct DbPath;

struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
  // The function recovers options to a previous version. Only 4.6 or later
  // versions are supported.
  ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
                                   int rocksdb_minor_version = 6);

  // Some functions that make it easier to optimize RocksDB

  // Use this if your DB is very small (like under 1GB) and you don't want to
  // spend lots of memory for memtables.
  // An optional cache object is passed in to be used as the block cache
  ColumnFamilyOptions* OptimizeForSmallDb(
      std::shared_ptr<Cache>* cache = nullptr);

  // Use this if you don't need to keep the data sorted, i.e. you'll never use
  // an iterator, only Put() and Get() API calls
  //
  // Not supported in ROCKSDB_LITE
  ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
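  //
  // A minimal usage sketch (illustrative, not part of the API; the 64MB
  // block cache size is an example value):
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.OptimizeForPointLookup(/*block_cache_size_mb=*/64);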

  // Default values for some parameters in ColumnFamilyOptions are not
  // optimized for heavy workloads and big datasets, which means you might
  // observe write stalls under some conditions. As a starting point for tuning
  // RocksDB options, use the following two functions:
  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
  // Universal style compaction is focused on reducing Write Amplification
  // Factor for big data sets, but increases Space Amplification. You can learn
  // more about the different styles here:
  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
  // Make sure to also call IncreaseParallelism(), which will provide the
  // biggest performance gains.
  // Note: we might use more memory than memtable_memory_budget during high
  // write rate periods.
  //
  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
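  //
  // A minimal usage sketch (illustrative, not part of the API; the path and
  // parallelism value below are example values):
  //
  //   Options options;
  //   options.create_if_missing = true;
  //   options.IncreaseParallelism(16);
  //   options.OptimizeLevelStyleCompaction();
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/testdb", &db);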

  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator = BytewiseComparator();

  // REQUIRES: The client must provide a merge operator if Merge operation
  // needs to be accessed. Calling Merge on a DB without a merge operator
  // would result in Status::NotSupported. The client must ensure that the
  // merge operator supplied here has the same name and *exactly* the same
  // semantics as the merge operator provided to previous open calls on
  // the same DB. The only exception is reserved for upgrade, where a DB
  // previously without a merge operator is introduced to Merge operation
  // for the first time. It's necessary to specify a merge operator when
  // opening the DB in this case.
  // Default: nullptr
  std::shared_ptr<MergeOperator> merge_operator = nullptr;
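  //
  // For example (illustrative sketch, not part of the API; Uint64AddOperator
  // is a hypothetical associative operator that sums counters stored as
  // decimal strings; requires rocksdb/merge_operator.h):
  //
  //   class Uint64AddOperator : public AssociativeMergeOperator {
  //    public:
  //     bool Merge(const Slice& /*key*/, const Slice* existing_value,
  //                const Slice& value, std::string* new_value,
  //                Logger* /*logger*/) const override {
  //       uint64_t base =
  //           existing_value ? std::stoull(existing_value->ToString()) : 0;
  //       *new_value = std::to_string(base + std::stoull(value.ToString()));
  //       return true;
  //     }
  //     const char* Name() const override { return "Uint64AddOperator"; }
  //   };
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.merge_operator = std::make_shared<Uint64AddOperator>();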

  // A single CompactionFilter instance to call into during compaction.
  // Allows an application to modify/delete a key-value during background
  // compaction.
  //
  // If the client requires a new compaction filter to be used for different
  // compaction runs, it can specify compaction_filter_factory instead of this
  // option.  The client should specify only one of the two.
  // compaction_filter takes precedence over compaction_filter_factory if
  // client specifies both.
  //
  // If multithreaded compaction is being used, the supplied CompactionFilter
  // instance may be used from different threads concurrently and so should be
  // thread-safe.
  //
  // Default: nullptr
  const CompactionFilter* compaction_filter = nullptr;
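  //
  // For example (illustrative sketch, not part of the API;
  // DropEmptyValueFilter is a hypothetical filter that discards entries
  // whose value is empty):
  //
  //   class DropEmptyValueFilter : public CompactionFilter {
  //    public:
  //     bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
  //                 std::string* /*new_value*/,
  //                 bool* /*value_changed*/) const override {
  //       return value.empty();  // returning true removes the entry
  //     }
  //     const char* Name() const override { return "DropEmptyValueFilter"; }
  //   };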

  // This is a factory that provides compaction filter objects which allow
  // an application to modify/delete a key-value during background compaction.
  //
  // A new filter will be created on each compaction run.  If multithreaded
  // compaction is being used, each created CompactionFilter will only be used
  // from a single thread and so does not need to be thread-safe.
  //
  // Default: nullptr
  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;

  // -------------------
  // Parameters that affect performance

  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
  //
  // Larger values increase performance, especially during bulk loads.
  // Up to max_write_buffer_number write buffers may be held in memory
  // at the same time,
  // so you may wish to adjust this parameter to control memory usage.
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
  //
  // Note that write_buffer_size is enforced per column family.
  // See db_write_buffer_size for sharing memory across column families.
  //
  // Default: 64MB
  //
  // Dynamically changeable through SetOptions() API
  size_t write_buffer_size = 64 << 20;
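  //
  // For example, to change it on a running DB (illustrative sketch; assumes
  // "db" is an open DB instance):
  //
  //   db->SetOptions({{"write_buffer_size", "131072"}});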

  // Compress blocks using the specified compression algorithm.
  //
  // Default: kSnappyCompression, if it's supported. If snappy is not linked
  // with the library, the default is kNoCompression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  //
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression.  Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  //
  // If you do not set `compression_opts.level`, or set it to
  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
  // default corresponding to `compression` as follows:
  //
  // - kZSTD: 3
  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
  // - kLZ4HCCompression: 0
  // - For all others, we do not specify a compression level
  //
  // Dynamically changeable through SetOptions() API
  CompressionType compression;

  // Compression algorithm that will be used for the bottommost level that
  // contains files.
  //
  // Default: kDisableCompressionOption (Disabled)
  CompressionType bottommost_compression = kDisableCompressionOption;

  // Different options for compression algorithms used by
  // bottommost_compression if it is enabled. To enable it, please see the
  // definition of CompressionOptions.
  CompressionOptions bottommost_compression_opts;

  // Different options for compression algorithms
  CompressionOptions compression_opts;

  // Number of files to trigger level-0 compaction. A value <0 means that
  // level-0 compaction will not be triggered by number of files at all.
  //
  // Default: 4
  //
  // Dynamically changeable through SetOptions() API
  int level0_file_num_compaction_trigger = 4;

  // If non-nullptr, use the specified function to determine the
  // prefixes for keys.  These prefixes will be placed in the filter.
  // Depending on the workload, this can reduce the number of read-IOP
  // cost for scans when a prefix is passed via ReadOptions to
  // db.NewIterator().  For prefix filtering to work properly,
  // "prefix_extractor" and "comparator" must be such that the following
  // properties hold:
  //
  // 1) key.starts_with(prefix(key))
  // 2) Compare(prefix(key), key) <= 0.
  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
  // 4) prefix(prefix(key)) == prefix(key)
  //
  // Default: nullptr
  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
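  //
  // For example (illustrative sketch; assumes keys share an 8-byte prefix
  // and requires rocksdb/slice_transform.h):
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.prefix_extractor.reset(NewFixedPrefixTransform(8));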

  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 200MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 200MB, total file size for level-2 will be 2GB,
  // and total file size for level-3 will be 20GB.
  //
  // Default: 256MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_bytes_for_level_base = 256 * 1048576;

  // Deprecated.
  uint64_t snap_refresh_nanos = 0;

  // Disable automatic compactions. Manual compactions can still
  // be issued on this column family
  //
  // Dynamically changeable through SetOptions() API
  bool disable_auto_compactions = false;

  // This is a factory that provides TableFactory objects.
  // Default: a block-based table factory that provides a default
  // implementation of TableBuilder and TableReader with default
  // BlockBasedTableOptions.
  std::shared_ptr<TableFactory> table_factory;
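  //
  // For example (illustrative sketch; requires rocksdb/table.h and
  // rocksdb/filter_policy.h; the block size and bits-per-key are example
  // values):
  //
  //   BlockBasedTableOptions table_opts;
  //   table_opts.block_size = 16 * 1024;
  //   table_opts.filter_policy.reset(NewBloomFilterPolicy(10));
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.table_factory.reset(NewBlockBasedTableFactory(table_opts));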

  // A list of paths where SST files for this column family
  // can be put into, with its target size. Similar to db_paths,
  // newer data is placed into paths specified earlier in the
  // vector while older data gradually moves to paths specified
  // later in the vector.
  // Note that, if a path is supplied to multiple column
  // families, it would have files and total size from all
  // the column families combined. User should provision for the
  // total size (from all the column families) in such cases.
  //
  // If left empty, db_paths will be used.
  // Default: empty
  std::vector<DbPath> cf_paths;

  // Compaction concurrent thread limiter for the column family.
  // If non-nullptr, use given concurrent thread limiter to control
  // the max outstanding compaction tasks. Limiter can be shared with
  // multiple column families across db instances.
  //
  // Default: nullptr
  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;

  // Create ColumnFamilyOptions with default values for all fields
  ColumnFamilyOptions();
  // Create ColumnFamilyOptions from Options
  explicit ColumnFamilyOptions(const Options& options);

  void Dump(Logger* log) const;
};

enum class WALRecoveryMode : char {
  // Original levelDB recovery
  // We tolerate incomplete records in trailing data on all logs
  // Use case : This is legacy behavior
  kTolerateCorruptedTailRecords = 0x00,
  // Recover from clean shutdown
  // We don't expect to find any corruption in the WAL
  // Use case : This is ideal for unit tests and rare applications that
  // can require high consistency guarantees
  kAbsoluteConsistency = 0x01,
  // Recover to point-in-time consistency (default)
  // We stop the WAL playback on discovering WAL inconsistency
  // Use case : Ideal for systems that have disk controller cache like
  // hard disk, SSD without super capacitor that store related data
  kPointInTimeRecovery = 0x02,
  // Recovery after a disaster
  // We ignore any corruption in the WAL and try to salvage as much data as
  // possible
  // Use case : Ideal for last-ditch efforts to recover data or systems that
  // operate with low grade unrelated data
  kSkipAnyCorruptedRecords = 0x03,
};

struct DbPath {
  std::string path;
  uint64_t target_size;  // Target size of total files under the path, in bytes.

  DbPath() : target_size(0) {}
  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
};

struct DBOptions {
  // The function recovers options to the values as in version 4.6.
  DBOptions* OldDefaults(int rocksdb_major_version = 4,
                         int rocksdb_minor_version = 6);

  // Some functions that make it easier to optimize RocksDB

  // Use this if your DB is very small (like under 1GB) and you don't want to
  // spend lots of memory for memtables.
  // An optional cache object is passed in, against which the memory used
  // by the memtable is charged.
  DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);

#ifndef ROCKSDB_LITE
  // By default, RocksDB uses only one background thread for flush and
  // compaction. Calling this function will set it up such that a total of
  // `total_threads` is used. A good value for `total_threads` is the number
  // of cores. You almost definitely want to call this function if your system
  // is bottlenecked by RocksDB.
  DBOptions* IncreaseParallelism(int total_threads = 16);
#endif  // ROCKSDB_LITE

  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing = false;

  // If true, missing column families will be automatically created.
  // Default: false
  bool create_missing_column_families = false;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists = false;

  // If true, RocksDB will aggressively check consistency of the data.
  // Also, if any of the writes to the database fail (Put, Delete, Merge,
  // Write), the database will switch to read-only mode and fail all other
  // Write operations.
  // In most cases you want this to be set to true.
  // Default: true
  bool paranoid_checks = true;

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc. In the near
  // future, support for doing storage operations such as read/write files
  // through env will be deprecated in favor of file_system (see below)
  // Default: Env::Default()
  Env* env = Env::Default();

  // Use to control write rate of flush and compaction. Flush has higher
  // priority than compaction. Rate limiting is disabled if nullptr.
  // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
  // Default: nullptr
  std::shared_ptr<RateLimiter> rate_limiter = nullptr;
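  //
  // For example (illustrative sketch; requires rocksdb/rate_limiter.h; the
  // 10MB/s rate is an example value):
  //
  //   DBOptions db_opts;
  //   db_opts.rate_limiter.reset(NewGenericRateLimiter(10 << 20));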

  // Use to track SST files and control their file deletion rate.
  //
  // Features:
  //  - Throttle the deletion rate of the SST files.
  //  - Keep track of the total size of all SST files.
  //  - Set a maximum allowed space limit for SST files that, when reached,
  //    stops any further flushes or compactions and sets the background
  //    error.
  //  - Can be shared between multiple dbs.
  // Limitations:
  //  - Only track and throttle deletes of SST files in
  //    first db_path (db_name if db_paths is empty).
  //
  // Default: nullptr
  std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
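  //
  // For example (illustrative sketch; requires rocksdb/sst_file_manager.h;
  // the 64MB/s delete rate is an example value):
  //
  //   DBOptions db_opts;
  //   db_opts.sst_file_manager.reset(NewSstFileManager(Env::Default()));
  //   db_opts.sst_file_manager->SetDeleteRateBytesPerSecond(64 << 20);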

  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-nullptr, or to a file stored
  // in the same directory as the DB contents if info_log is nullptr.
  // Default: nullptr
  std::shared_ptr<Logger> info_log = nullptr;

#ifdef NDEBUG
  InfoLogLevel info_log_level = INFO_LEVEL;
#else
  InfoLogLevel info_log_level = DEBUG_LEVEL;
#endif  // NDEBUG

  // Number of open files that can be used by the DB.  You may need to
  // increase this if your database has a large working set. Value -1 means
  // files opened are always kept open. You can estimate number of files based
  // on target_file_size_base and target_file_size_multiplier for level-based
  // compaction. For universal-style compaction, you can usually set it to -1.
  //
  // Default: -1
  //
  // Dynamically changeable through SetDBOptions() API.
  int max_open_files = -1;

  // If max_open_files is -1, DB will open all files on DB::Open(). You can
  // use this option to increase the number of threads used to open the files.
  // Default: 16
  int max_file_opening_threads = 16;

  // Once write-ahead logs exceed this size, we will start forcing the flush of
  // column families whose memtables are backed by the oldest live WAL file
  // (i.e. the ones that are causing all the space amplification). If set to 0
  // (default), we will dynamically choose the WAL size limit to be
  // [sum of all write_buffer_size * max_write_buffer_number] * 4
  // This option takes effect only when there is more than one column family;
  // otherwise, the WAL size is dictated by the write_buffer_size.
  //
  // Default: 0
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t max_total_wal_size = 0;

  // If non-null, then we should collect metrics about database operations
  std::shared_ptr<Statistics> statistics = nullptr;
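  //
  // For example (illustrative sketch; requires rocksdb/statistics.h):
  //
  //   DBOptions db_opts;
  //   db_opts.statistics = CreateDBStatistics();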

  // By default, writes to stable storage use fdatasync (on platforms
  // where this function is available). If this option is true,
  // fsync is used instead.
  //
  // fsync and fdatasync are equally safe for our purposes and fdatasync is
  // faster, so it is rarely necessary to set this option. It is provided
  // as a workaround for kernel/filesystem bugs, such as one that affected
  // fdatasync with ext4 in kernel versions prior to 3.7.
  bool use_fsync = false;

  // A list of paths where SST files can be put into, with its target size.
  // Newer data is placed into paths specified earlier in the vector while
  // older data gradually moves to paths specified later in the vector.
  //
  // For example, if you have a flash device with 10GB allocated for the DB,
  // as well as a hard drive of 2TB, you should configure it to be:
  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
  //
  // The system will try to guarantee data under each path is close to but
  // not larger than the target size. But current and future file sizes used
  // in determining where to place a file are based on best-effort estimation,
  // which means there is a chance that the actual size under the directory
  // is slightly more than target size under some workloads. User should give
  // some buffer room for those cases.
  //
  // If none of the paths has sufficient room to place a file, the file will
  // be placed in the last path anyway, regardless of the target size.
  //
  // Placing newer data in earlier paths is also best-effort. User should
  // expect user files to be placed in higher levels in some extreme cases.
  //
  // If left empty, only one path will be used, which is db_name passed when
  // opening the DB.
  // Default: empty
  std::vector<DbPath> db_paths;
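  //
  // For example (illustrative sketch matching the description above):
  //
  //   DBOptions db_opts;
  //   db_opts.db_paths.emplace_back("/flash_path", 10ULL << 30);  // 10GB
  //   db_opts.db_paths.emplace_back("/hard_drive", 2ULL << 40);   // 2TB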

  // This specifies the info LOG dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir = "";

  // This specifies the absolute dir path for write-ahead logs (WAL).
  // If it is empty, the log files will be in the same dir as data,
  //   dbname is used as the data dir by default
  // If it is non empty, the log files will be kept in the specified dir.
  // When destroying the db,
  //   all log files in wal_dir and the dir itself are deleted
  std::string wal_dir = "";

  // The periodicity when obsolete files get deleted. The default
  // value is 6 hours. The files that get out of scope by compaction
  // process will still get automatically deleted on every compaction,
  // regardless of this setting
  //
  // Default: 6 hours
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;

  // Maximum number of concurrent background jobs (compactions and flushes).
  //
  // Default: 2
  //
  // Dynamically changeable through SetDBOptions() API.
  int max_background_jobs = 2;

  // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
  // value of max_background_jobs. This option is ignored.
  //
  // Dynamically changeable through SetDBOptions() API.
  int base_background_compactions = -1;

  // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
  // value of max_background_jobs. For backwards compatibility we will set
  // `max_background_jobs = max_background_compactions + max_background_flushes`
  // in the case where user sets at least one of `max_background_compactions`
  // or `max_background_flushes` (we replace -1 by 1 in case one option is
  // unset).
  //
  // Maximum number of concurrent background compaction jobs, submitted to
  // the default LOW priority thread pool.
  //
  // If you're increasing this, also consider increasing number of threads in
  // LOW priority thread pool. For more information, see
  // Env::SetBackgroundThreads
  //
  // Default: -1
  //
  // Dynamically changeable through SetDBOptions() API.
  int max_background_compactions = -1;

  // This value represents the maximum number of threads that will
  // concurrently perform a compaction job by breaking it into multiple,
  // smaller ones that are run simultaneously.
  // Default: 1 (i.e. no subcompactions)
  uint32_t max_subcompactions = 1;

  // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
  // value of max_background_jobs. For backwards compatibility we will set
  // `max_background_jobs = max_background_compactions + max_background_flushes`
  // in the case where user sets at least one of `max_background_compactions`
  // or `max_background_flushes`.
  //
  // Maximum number of concurrent background memtable flush jobs, submitted by
  // default to the HIGH priority thread pool. If the HIGH priority thread pool
  // is configured to have zero threads, flush jobs will share the LOW priority
  // thread pool with compaction jobs.
  //
  // It is important to use both thread pools when the same Env is shared by
  // multiple db instances. Without a separate pool, long running compaction
  // jobs could potentially block memtable flush jobs of other db instances,
  // leading to unnecessary Put stalls.
  //
  // If you're increasing this, also consider increasing number of threads in
  // HIGH priority thread pool. For more information, see
  // Env::SetBackgroundThreads
  // Default: -1
  int max_background_flushes = -1;

  // Specify the maximal size of the info log file. If the log file
  // is larger than `max_log_file_size`, a new info log file will
  // be created.
  // If max_log_file_size == 0, all logs will be written to one
  // log file.
  size_t max_log_file_size = 0;

  // Time for the info log file to roll (in seconds).
  // If specified with non-zero value, log file will be rolled
  // if it has been active longer than `log_file_time_to_roll`.
  // Default: 0 (disabled)
  // Not supported in ROCKSDB_LITE mode!
  size_t log_file_time_to_roll = 0;

  // Maximal info log files to be kept.
  // Default: 1000
  size_t keep_log_file_num = 1000;

  // Recycle log files.
  // If non-zero, we will reuse previously written log files for new
  // logs, overwriting the old data.  The value indicates how many
  // such files we will keep around at any point in time for later
  // use.  This is more efficient because the blocks are already
  // allocated and fdatasync does not need to update the inode after
  // each write.
  // Default: 0
  size_t recycle_log_file_num = 0;

  // manifest file is rolled over on reaching this limit.
  // The older manifest file will be deleted.
  // The default value is 1GB so that the manifest file can grow, but not
  // reach the limit of storage capacity.
  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;

  // Number of shards used for table cache.
  int table_cache_numshardbits = 6;

  // NOT SUPPORTED ANYMORE
  // int table_cache_remove_scan_count_limit;

  // The following two fields affect how archived logs will be deleted.
  // 1. If both set to 0, logs will be deleted asap and will not get into
  //    the archive.
  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
  //    WAL files will be checked every 10 min and if total size is greater
  //    than WAL_size_limit_MB, they will be deleted starting with the
  //    earliest until size_limit is met. All empty files will be deleted.
  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
  //    are older than WAL_ttl_seconds will be deleted.
  // 4. If both are not 0, WAL files will be checked every 10 min and both
  //    checks will be performed with ttl being first.
  uint64_t WAL_ttl_seconds = 0;
  uint64_t WAL_size_limit_MB = 0;

  // Number of bytes to preallocate (via fallocate) the manifest
  // files.  Default is 4MB, which is reasonable to reduce random IO
  // as well as prevent overallocation for mounts that preallocate
  // large amounts of data (such as xfs's allocsize option).
  size_t manifest_preallocation_size = 4 * 1024 * 1024;

  // Allow the OS to mmap file for reading sst tables. Default: false
  bool allow_mmap_reads = false;

  // Allow the OS to mmap file for writing.
  // DB::SyncWAL() only works if this is set to false.
  // Default: false
  bool allow_mmap_writes = false;

  // Enable direct I/O mode for reads/writes.
  // These options may or may not improve performance depending on the use
  // case.
  //
  // Files will be opened in "direct I/O" mode,
  // which means that data r/w from the disk will not be cached or
  // buffered. The hardware buffer of the devices may however still
  // be used. Memory mapped files are not impacted by these parameters.

  // Use O_DIRECT for user and compaction reads.
  // When true, we also force new_table_reader_for_compaction_inputs to true.
  // Default: false
  // Not supported in ROCKSDB_LITE mode!
  bool use_direct_reads = false;

  // Use O_DIRECT for writes in background flush and compactions.
  // Default: false
  // Not supported in ROCKSDB_LITE mode!
  bool use_direct_io_for_flush_and_compaction = false;

  // If false, fallocate() calls are bypassed
  bool allow_fallocate = true;

  // Disable child processes inheriting open files. Default: true
  bool is_fd_close_on_exec = true;

  // NOT SUPPORTED ANYMORE -- this option is no longer used
  bool skip_log_error_on_recovery = false;

  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
  //
  // Default: 600 (10 min)
  //
  // Dynamically changeable through SetDBOptions() API.
  unsigned int stats_dump_period_sec = 600;

  // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
  // Default: 600
  unsigned int stats_persist_period_sec = 600;

  // If true, automatically persist stats to a hidden column family (column
  // family name: ___rocksdb_stats_history___) every
  // stats_persist_period_sec seconds; otherwise, write to an in-memory
  // struct. User can query through `GetStatsHistory` API.
  // If user attempts to create a column family with the same name on a DB
  // which has previously set persist_stats_to_disk to true, the column family
  // creation will fail, but the hidden column family will survive, as well as
  // the previously persisted statistics.
  // When persisting stats to disk, the stat name will be limited to 100 bytes.
  // Default: false
  bool persist_stats_to_disk = false;

  // if not zero, periodically take stats snapshots and store in memory, the
  // memory size for stats snapshots is capped at stats_history_buffer_size
  // Default: 1MB
  size_t stats_history_buffer_size = 1024 * 1024;

  // If set to true, will hint the underlying file system that the file
  // access pattern is random, when an sst file is opened.
  // Default: true
  bool advise_random_on_open = true;

  // Amount of data to build up in memtables across all column
  // families before writing to disk.
  //
  // This is distinct from write_buffer_size, which enforces a limit
  // for a single memtable.
  //
  // This feature is disabled by default. Specify a non-zero value
  // to enable it.
  //
  // Default: 0 (disabled)
  size_t db_write_buffer_size = 0;

  // The memory usage of memtables will be reported to this object. The same
  // object can be passed into multiple DBs and it will track the sum of size
  // of all the DBs. If the total size of all live memtables of all the DBs
  // exceeds a limit, a flush will be triggered in the next DB to which the
  // next write is issued.
  //
  // If the object is only passed to one DB, the behavior is the same as
  // db_write_buffer_size. When write_buffer_manager is set, the value set will
  // override db_write_buffer_size.
  //
  // This feature is disabled by default. Specify a non-zero value
  // to enable it.
  //
  // Default: null
  std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
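  //
  // For example (illustrative sketch; caps total memtable memory across all
  // DBs sharing the manager at 1GB, an example value):
  //
  //   auto wbm = std::make_shared<WriteBufferManager>(1ULL << 30);
  //   DBOptions db_opts;
  //   db_opts.write_buffer_manager = wbm;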

  // Specify the file access pattern once a compaction is started.
  // It will be applied to all input files of a compaction.
  // Default: NORMAL
  enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
  AccessHint access_hint_on_compaction_start = NORMAL;

  // If true, always create a new file descriptor and new table reader
  // for compaction inputs. Turning this parameter on may introduce extra
  // memory usage in the table reader, if it allocates extra memory
  // for indexes. This will allow file descriptor prefetch options
  // to be set for compaction input files and not to impact file
  // descriptors for the same file used by user queries.
  // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks
  // for this mode if using block-based table.
  //
  // Default: false
  // This flag has no effect on the behavior of compaction and is planned to
  // be removed in the future.
  bool new_table_reader_for_compaction_inputs = false;

  // If non-zero, we perform bigger reads when doing compaction. If you're
  // running RocksDB on spinning disks, you should set this to at least 2MB.
  // That way RocksDB's compaction is doing sequential instead of random reads.
  //
  // When non-zero, we also force new_table_reader_for_compaction_inputs to
  // true.
  //
  // Default: 0
  //
  // Dynamically changeable through SetDBOptions() API.
  size_t compaction_readahead_size = 0;

  // This is a maximum buffer size that is used by WinMmapReadableFile in
  // unbuffered disk I/O mode. We need to maintain an aligned buffer for
  // reads. We allow the buffer to grow until the specified value and then
  // for bigger requests allocate one shot buffers. In unbuffered mode we
  // always bypass read-ahead buffer at ReadaheadRandomAccessFile
  // When read-ahead is required we then make use of compaction_readahead_size
  // value and always try to read ahead. With read-ahead we always
  // pre-allocate buffer to the size instead of growing it up to a limit.
  //
  // This option is currently honored only on Windows
  //
  // Default: 1 MB
  //
  // Special value: 0 - means do not maintain per instance buffer. Allocate
  //                per request buffer and avoid locking.
  size_t random_access_max_buffer_size = 1024 * 1024;

  // This is the maximum buffer size that is used by WritableFileWriter.
  // On Windows, we need to maintain an aligned buffer for writes.
  // We allow the buffer to grow until its size hits the limit in buffered
  // IO and fix the buffer size when using direct IO to ensure alignment of
  // write requests if the logical sector size is unusual
  //
  // Default: 1024 * 1024 (1 MB)
  //
  // Dynamically changeable through SetDBOptions() API.
  size_t writable_file_max_buffer_size = 1024 * 1024;

  // Use adaptive mutex, which spins in the user space before resorting
  // to kernel. This could reduce context switch when the mutex is not
  // heavily contended. However, if the mutex is hot, we could end up
  // wasting spin time.
  // Default: false
  bool use_adaptive_mutex = false;

  // Create DBOptions with default values for all fields
  DBOptions();
  // Create DBOptions from Options
  explicit DBOptions(const Options& options);

  void Dump(Logger* log) const;

  // Allows OS to incrementally sync files to disk while they are being
  // written, asynchronously, in the background. This operation can be used
  // to smooth out write I/Os over time. Users shouldn't rely on it for
  // persistency guarantee.
  // Issue one request for every bytes_per_sync written. 0 turns it off.
  //
  // You may consider using rate_limiter to regulate write rate to device.
  // When rate limiter is enabled, it automatically sets bytes_per_sync
  // to 1MB.
  //
  // This option applies to table files
  //
  // Default: 0, turned off
  //
  // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
  // Dynamically changeable through SetDBOptions() API.
  uint64_t bytes_per_sync = 0;

  // Same as bytes_per_sync, but applies to WAL files
  //
  // Default: 0, turned off
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t wal_bytes_per_sync = 0;

  // When true, guarantees WAL files have at most `wal_bytes_per_sync`
  // bytes submitted for writeback at any given time, and SST files have at most
  // `bytes_per_sync` bytes pending writeback at any given time. This can be
  // used to handle cases where processing speed exceeds I/O speed during file
  // generation, which can lead to a huge sync when the file is finished, even
  // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
  //
  //  - If `sync_file_range` is supported it achieves this by waiting for any
  //    prior `sync_file_range`s to finish before proceeding. In this way,
  //    processing (compression, etc.) can proceed uninhibited in the gap
  //    between `sync_file_range`s, and we block only when I/O falls behind.
  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
  //    always blocks, thus preventing the interleaving of I/O and processing.
  //
  // Note: Enabling this option does not provide any additional persistence
  // guarantees, as it may use `sync_file_range`, which does not write out
  // metadata.
  //
  // Default: false
  bool strict_bytes_per_sync = false;

  // A vector of EventListeners whose callback functions will be called
  // when a specific RocksDB event happens.
  std::vector<std::shared_ptr<EventListener>> listeners;
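  //
  // For example (illustrative sketch; MyFlushListener is a hypothetical
  // EventListener subclass that overrides OnFlushCompleted()):
  //
  //   DBOptions db_opts;
  //   db_opts.listeners.emplace_back(std::make_shared<MyFlushListener>());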

  // If true, then the status of the threads involved in this DB will
  // be tracked and available via GetThreadList() API.
  //
  // Default: false
  bool enable_thread_tracking = false;

  // The limited write rate to DB if soft_pending_compaction_bytes_limit or
  // level0_slowdown_writes_trigger is triggered, or we are writing to the
  // last mem table allowed and we allow more than 3 mem tables. It is
  // calculated using size of user write requests before compression.
  // RocksDB may decide to slow down more if the compaction still
  // gets behind further.
  // If the value is 0, we will infer a value from `rate_limiter` value
  // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
  // if users change the rate in `rate_limiter` after DB is opened,
  // `delayed_write_rate` won't be adjusted.
  //
  // Unit: bytes per second.
  //
  // Default: 0
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t delayed_write_rate = 0;

  // By default, a single write thread queue is maintained. The thread that
  // gets to the head of the queue becomes the write batch group leader and is
  // responsible for writing to the WAL and memtable for the batch group.
  //
  // If enable_pipelined_write is true, separate write thread queues are
  // maintained for WAL writes and memtable writes. A write thread first
  // enters the WAL writer queue and then the memtable writer queue. A pending
  // thread on the WAL writer queue thus only has to wait for previous writers
  // to finish their WAL writing but not the memtable writing. Enabling the
  // feature may improve write throughput and reduce latency of the prepare
  // phase of two-phase commit.
  //
  // Default: false
  bool enable_pipelined_write = false;

  // Setting unordered_write to true trades higher write throughput for
  // relaxing the immutability guarantee of snapshots. This violates the
  // repeatability one expects from ::Get from a snapshot, as well as
  // ::MultiGet and Iterator's consistent-point-in-time view property.
  // If the application cannot tolerate the relaxed guarantees, it can implement
  // its own mechanisms to work around that and yet benefit from the higher
  // throughput. Using TransactionDB with WRITE_PREPARED write policy and
  // two_write_queues=true is one way to achieve immutable snapshots despite
  // unordered_write.
  //
  // By default, i.e., when it is false, rocksdb does not advance the sequence
  // number for new snapshots unless all the writes with lower sequence numbers
  // are already finished. This provides the immutability that we expect from
  // snapshots. Moreover, since Iterator and MultiGet internally depend on
  // snapshots, the snapshot immutability results in Iterator and MultiGet
  // offering a consistent-point-in-time view. If set to true, although
  // Read-Your-Own-Write property is still provided, the snapshot immutability
  // property is relaxed: the writes issued after the snapshot is obtained (with
  // larger sequence numbers) will still not be visible to the reads from that
  // snapshot; however, there still might be pending writes (with lower
  // sequence numbers) that will change the state visible to the snapshot after
  // they land in the memtable.
  //
  // Default: false
  bool unordered_write = false;

  // If true, allow multi-writers to update mem tables in parallel.
  // Only some memtable_factory-s support concurrent writes; currently it
  // is implemented only for SkipListFactory.  Concurrent memtable writes
  // are not compatible with inplace_update_support or filter_deletes.
  // It is strongly recommended to set enable_write_thread_adaptive_yield
  // if you are going to use this feature.
  //
  // Default: true
  bool allow_concurrent_memtable_write = true;

  // If true, threads synchronizing with the write batch group leader will
  // wait for up to write_thread_max_yield_usec before blocking on a mutex.
  // This can substantially improve throughput for concurrent workloads,
  // regardless of whether allow_concurrent_memtable_write is enabled.
  //
  // Default: true
  bool enable_write_thread_adaptive_yield = true;

  // The maximum limit of number of bytes that are written in a single batch
  // of WAL or memtable write. It is followed when the leader write size
  // is larger than 1/8 of this limit.
  //
  // Default: 1 MB
  uint64_t max_write_batch_group_size_bytes = 1 << 20;

  // The maximum number of microseconds that a write operation will use
  // a yielding spin loop to coordinate with other write threads before
  // blocking on a mutex.  (Assuming write_thread_slow_yield_usec is
  // set properly) increasing this value is likely to increase RocksDB
  // throughput at the expense of increased CPU usage.
  //
  // Default: 100
  uint64_t write_thread_max_yield_usec = 100;

  // The latency in microseconds after which a std::this_thread::yield
  // call (sched_yield on Linux) is considered to be a signal that
  // other processes or threads would like to use the current core.
  // Increasing this makes writer threads more likely to take CPU
  // by spinning, which will show up as an increase in the number of
  // involuntary context switches.
  //
  // Default: 3
  uint64_t write_thread_slow_yield_usec = 3;

  // If true, then DB::Open() will not update the statistics used to optimize
  // compaction decision by loading table properties from many files.
  // Turning off this feature will improve DBOpen time especially in
  // a disk environment.
  //
  // Default: false
  bool skip_stats_update_on_db_open = false;

  // If true, then DB::Open() will not fetch and check sizes of all sst files.
  // This may significantly speed up startup if there are many sst files,
  // especially when using non-default Env with expensive GetFileSize().
  // We'll still check that all required sst files exist.
  // If paranoid_checks is false, this option is ignored, and sst files are
  // not checked at all.
  //
  // Default: false
  bool skip_checking_sst_file_sizes_on_db_open = false;
