001: /*-
002: * See the file LICENSE for redistribution information.
003: *
004: * Copyright (c) 2002,2008 Oracle. All rights reserved.
005: *
006: * $Id: SR12885Test.java,v 1.5.2.3 2008/01/07 15:14:25 cwl Exp $
007: */
008:
009: package com.sleepycat.je.cleaner;
010:
011: import java.io.File;
012: import java.io.IOException;
013:
014: import junit.framework.TestCase;
015:
016: import com.sleepycat.je.CheckpointConfig;
017: import com.sleepycat.je.Database;
018: import com.sleepycat.je.DatabaseConfig;
019: import com.sleepycat.je.DatabaseEntry;
020: import com.sleepycat.je.DatabaseException;
021: import com.sleepycat.je.DbInternal;
022: import com.sleepycat.je.Environment;
023: import com.sleepycat.je.EnvironmentConfig;
024: import com.sleepycat.je.OperationStatus;
025: import com.sleepycat.je.Transaction;
026: import com.sleepycat.je.config.EnvironmentParams;
027: import com.sleepycat.je.log.FileManager;
028: import com.sleepycat.je.util.TestUtils;
029:
030: /**
031: * Reproduces a problem found in SR12885 where we failed to migrate a pending
032: * LN if the slot was reused by an active transaction and that transaction was
033: * later aborted.
034: *
* This bug can manifest as a LogFileNotFoundException. However, there was
* another
036: * bug that caused this bug to manifest sometimes as a NOTFOUND return value.
* This secondary problem -- more sloppiness than a real bug -- was that the
038: * PendingDeleted flag was not cleared during an abort. If the PendingDeleted
039: * flag is set, the low level fetch method will return null rather than
040: * throwing a LogFileNotFoundException. This caused a NOTFOUND in some cases.
041: *
042: * The sequence that causes the bug is:
043: *
044: * 1) The cleaner processes a file containing LN-A (node A) for key X. Key X
045: * is a non-deleted LN.
046: *
047: * 2) The cleaner sets the migrate flag on the BIN entry for LN-A.
048: *
049: * 3) In transaction T-1, LN-A is deleted and replaced by LN-B with key X,
050: * reusing the same slot but assigning a new node ID. At this point both node
051: * IDs (LN-A and LN-B) are locked.
052: *
053: * 4) The cleaner (via a checkpoint or eviction that logs the BIN) tries to
054: * migrate LN-B, the current LN in the BIN, but finds it locked. It adds LN-B
055: * to the pending LN list.
056: *
057: * 5) T-1 aborts, putting the LSN of LN-A back into the BIN slot.
058: *
059: * 6) In transaction T-2, LN-A is deleted and replaced by LN-C with key X,
060: * reusing the same slot but assigning a new node ID. At this point both node
061: * IDs (LN-A and LN-C) are locked.
062: *
063: * 7) The cleaner (via a checkpoint or wakeup) processes the pending LN-B. It
064: * first gets a lock on node B, then does the tree lookup. It finds LN-C in
065: * the tree, but it doesn't notice that it has a different node ID than the
066: * node it locked.
067: *
068: * 8) The cleaner sees that LN-C is deleted, and therefore no migration is
069: * necessary -- this is incorrect. It removes LN-B from the pending list,
070: * allowing the cleaned file to be deleted.
071: *
072: * 9) T-2 aborts, putting the LSN of LN-A back into the BIN slot.
073: *
074: * 10) A fetch of key X will fail, since the file containing the LSN for LN-A
075: * has been deleted. If we didn't clear the PendingDeleted flag, this will
076: * cause a NOTFOUND error instead of a LogFileNotFoundException.
077: */
078: public class SR12885Test extends TestCase {
079:
080: private static final String DB_NAME = "foo";
081:
082: private static final CheckpointConfig forceConfig = new CheckpointConfig();
083: static {
084: forceConfig.setForce(true);
085: }
086:
087: private File envHome;
088: private Environment env;
089: private Database db;
090:
091: public SR12885Test() {
092: envHome = new File(System.getProperty(TestUtils.DEST_DIR));
093: }
094:
095: public void setUp() throws IOException, DatabaseException {
096:
097: TestUtils.removeLogFiles("Setup", envHome, false);
098: TestUtils.removeFiles("Setup", envHome, FileManager.DEL_SUFFIX);
099: }
100:
101: public void tearDown() throws IOException, DatabaseException {
102:
103: try {
104: if (env != null) {
105: env.close();
106: }
107: } catch (Throwable e) {
108: System.out.println("tearDown: " + e);
109: }
110:
111: try {
112: TestUtils.removeLogFiles("tearDown", envHome, true);
113: TestUtils.removeFiles("tearDown", envHome,
114: FileManager.DEL_SUFFIX);
115: } catch (Throwable e) {
116: System.out.println("tearDown: " + e);
117: }
118:
119: db = null;
120: env = null;
121: envHome = null;
122: }
123:
124: /**
125: * Opens the environment and database.
126: */
127: private void openEnv() throws DatabaseException {
128:
129: EnvironmentConfig config = TestUtils.initEnvConfig();
130: DbInternal.disableParameterValidation(config);
131: config.setTransactional(true);
132: config.setAllowCreate(true);
133: /* Do not run the daemons. */
134: config.setConfigParam(EnvironmentParams.ENV_RUN_CLEANER
135: .getName(), "false");
136: config.setConfigParam(EnvironmentParams.ENV_RUN_EVICTOR
137: .getName(), "false");
138: config.setConfigParam(EnvironmentParams.ENV_RUN_CHECKPOINTER
139: .getName(), "false");
140: config.setConfigParam(EnvironmentParams.ENV_RUN_INCOMPRESSOR
141: .getName(), "false");
142: /* Use a small log file size to make cleaning more frequent. */
143: config.setConfigParam(EnvironmentParams.LOG_FILE_MAX.getName(),
144: Integer.toString(1024));
145: env = new Environment(envHome, config);
146:
147: openDb();
148: }
149:
150: /**
151: * Opens that database.
152: */
153: private void openDb() throws DatabaseException {
154:
155: DatabaseConfig dbConfig = new DatabaseConfig();
156: dbConfig.setTransactional(true);
157: dbConfig.setAllowCreate(true);
158: db = env.openDatabase(null, DB_NAME, dbConfig);
159: }
160:
161: /**
162: * Closes the environment and database.
163: */
164: private void closeEnv() throws DatabaseException {
165:
166: if (db != null) {
167: db.close();
168: db = null;
169: }
170: if (env != null) {
171: env.close();
172: env = null;
173: }
174: }
175:
176: public void testSR12885() throws DatabaseException {
177:
178: openEnv();
179:
180: final int COUNT = 10;
181: DatabaseEntry key = new DatabaseEntry();
182: DatabaseEntry data = new DatabaseEntry(TestUtils
183: .getTestArray(0));
184: OperationStatus status;
185:
186: /* Add some records, enough to fill a log file. */
187: for (int i = 0; i < COUNT; i += 1) {
188: key.setData(TestUtils.getTestArray(i));
189: status = db.putNoOverwrite(null, key, data);
190: assertEquals(OperationStatus.SUCCESS, status);
191: }
192:
193: /*
194: * Delete all but key 0, so the first file can be cleaned but key 0
195: * will need to be migrated.
196: */
197: for (int i = 1; i < COUNT; i += 1) {
198: key.setData(TestUtils.getTestArray(i));
199: status = db.delete(null, key);
200: assertEquals(OperationStatus.SUCCESS, status);
201: }
202:
203: /*
204: * Checkpoint and clean to set the migrate flag for key 0. This must
205: * be done when key 0 is not locked, so that it will not be put onto
206: * the pending list yet. Below we cause it to be put onto the pending
207: * list with a different node ID.
208: */
209: env.checkpoint(forceConfig);
210: int cleaned = env.cleanLog();
211: assertTrue("cleaned=" + cleaned, cleaned > 0);
212:
213: /*
214: * Using a transaction, delete then insert key 0, reusing the slot.
215: * The insertion assigns a new node ID. Don't abort the transaction
216: * until after the cleaner migration is finished.
217: */
218: Transaction txn = env.beginTransaction(null, null);
219: key.setData(TestUtils.getTestArray(0));
220: status = db.delete(txn, key);
221: assertEquals(OperationStatus.SUCCESS, status);
222: status = db.putNoOverwrite(txn, key, data);
223: assertEquals(OperationStatus.SUCCESS, status);
224:
225: /*
226: * Checkpoint again to perform LN migration. LN migration will not
227: * migrate key 0 because it is locked -- it will be put onto the
228: * pending list. But the LN put on the pending list will be the newly
229: * inserted node, which has a different node ID than the LN that needs
230: * to be migrated -- this is the first condition for the bug.
231: */
232: env.checkpoint(forceConfig);
233:
234: /*
235: * Abort the transaction to revert to the original node ID for key 0.
236: * Then perform a delete with a new transaction. This makes the
237: * current LN for key 0 deleted.
238: */
239: txn.abort();
240: txn = env.beginTransaction(null, null);
241: key.setData(TestUtils.getTestArray(0));
242: status = db.delete(txn, key);
243: assertEquals(OperationStatus.SUCCESS, status);
244:
245: /*
246: * The current state of key 0 is that the BIN contains a deleted LN,
247: * and that LN has a node ID that is different than the one in the
248: * pending LN list. This node is the one that needs to be migrated.
249: *
250: * Perform a checkpoint to cause pending LNs to be processed and then
251: * delete the cleaned file. When we process the pending LN, we'll lock
252: * the pending LN's node ID (the one we inserted and aborted), which is
253: * the wrong node ID. We'll then examine the current LN, find it
254: * deleted, and neglect to migrate the LN that needs to be migrated.
255: * The error is that we don't lock the node ID of the current LN.
256: *
257: * Then abort the delete transaction. That will revert the BIN entry
258: * to the node we failed to migrate. If we then try to fetch key 0,
259: * we'll get LogNotFoundException.
260: */
261: env.checkpoint(forceConfig);
262: txn.abort();
263: status = db.get(null, key, data, null);
264: assertEquals(OperationStatus.SUCCESS, status);
265:
266: /* If we get this far without LogNotFoundException, it's fixed. */
267:
268: closeEnv();
269: }
270: }
|