Git fork
1/*
2 * diff-delta.c: generate a delta between two buffers
3 *
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
6 *
7 * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
8 *
9 * This code is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#define DISABLE_SIGN_COMPARE_WARNINGS
15
16#include "git-compat-util.h"
17#include "delta.h"
18
19/* maximum hash entry list for the same hash bucket */
20#define HASH_LIMIT 64
21
22#define RABIN_SHIFT 23
23#define RABIN_WINDOW 16
24
25static const unsigned int T[256] = {
26 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
27 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
28 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
29 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
30 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
31 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
32 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
33 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
34 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
35 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
36 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
37 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
38 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
39 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
40 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
41 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
42 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
43 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
44 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
45 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
46 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
47 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
48 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
49 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
50 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
51 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
52 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
53 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
54 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
55 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
56 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
57 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
58 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
59 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
60 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
61 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
62 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
63 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
64 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
65 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
66 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
67 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
68 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
69};
70
71static const unsigned int U[256] = {
72 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
73 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
74 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
75 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
76 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
77 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
78 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
79 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
80 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
81 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
82 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
83 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
84 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
85 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
86 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
87 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
88 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
89 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
90 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
91 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
92 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
93 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
94 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
95 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
96 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
97 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
98 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
99 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
100 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
101 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
102 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
103 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
104 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
105 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
106 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
107 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
108 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
109 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
110 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
111 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
112 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
113 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
114 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
115};
116
117struct index_entry {
118 const unsigned char *ptr;
119 unsigned int val;
120};
121
122struct unpacked_index_entry {
123 struct index_entry entry;
124 struct unpacked_index_entry *next;
125};
126
127struct delta_index {
128 unsigned long memsize;
129 const void *src_buf;
130 unsigned long src_size;
131 unsigned int hash_mask;
132 struct index_entry *hash[FLEX_ARRAY];
133};
134
135struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
136{
137 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
138 const unsigned char *data, *buffer = buf;
139 struct delta_index *index;
140 struct unpacked_index_entry *entry, **hash;
141 struct index_entry *packed_entry, **packed_hash;
142 void *mem;
143 unsigned long memsize;
144
145 if (!buf || !bufsize)
146 return NULL;
147
148 /* Determine index hash size. Note that indexing skips the
149 first byte to allow for optimizing the Rabin's polynomial
150 initialization in create_delta(). */
151 entries = (bufsize - 1) / RABIN_WINDOW;
152 if (bufsize >= 0xffffffffUL) {
153 /*
154 * Current delta format can't encode offsets into
155 * reference buffer with more than 32 bits.
156 */
157 entries = 0xfffffffeU / RABIN_WINDOW;
158 }
159 hsize = entries / 4;
160 for (i = 4; (1u << i) < hsize; i++);
161 hsize = 1 << i;
162 hmask = hsize - 1;
163
164 /* allocate lookup index */
165 memsize = sizeof(*hash) * hsize +
166 sizeof(*entry) * entries;
167 mem = malloc(memsize);
168 if (!mem)
169 return NULL;
170 hash = mem;
171 mem = hash + hsize;
172 entry = mem;
173
174 memset(hash, 0, hsize * sizeof(*hash));
175
176 /* allocate an array to count hash entries */
177 hash_count = calloc(hsize, sizeof(*hash_count));
178 if (!hash_count) {
179 free(hash);
180 return NULL;
181 }
182
183 /* then populate the index */
184 prev_val = ~0;
185 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
186 data >= buffer;
187 data -= RABIN_WINDOW) {
188 unsigned int val = 0;
189 for (i = 1; i <= RABIN_WINDOW; i++)
190 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
191 if (val == prev_val) {
192 /* keep the lowest of consecutive identical blocks */
193 entry[-1].entry.ptr = data + RABIN_WINDOW;
194 --entries;
195 } else {
196 prev_val = val;
197 i = val & hmask;
198 entry->entry.ptr = data + RABIN_WINDOW;
199 entry->entry.val = val;
200 entry->next = hash[i];
201 hash[i] = entry++;
202 hash_count[i]++;
203 }
204 }
205
206 /*
207 * Determine a limit on the number of entries in the same hash
208 * bucket. This guards us against pathological data sets causing
209 * really bad hash distribution with most entries in the same hash
210 * bucket that would bring us to O(m*n) computing costs (m and n
211 * corresponding to reference and target buffer sizes).
212 *
213 * Make sure none of the hash buckets has more entries than
214 * we're willing to test. Otherwise we cull the entry list
215 * uniformly to still preserve a good repartition across
216 * the reference buffer.
217 */
218 for (i = 0; i < hsize; i++) {
219 int acc;
220
221 if (hash_count[i] <= HASH_LIMIT)
222 continue;
223
224 /* We leave exactly HASH_LIMIT entries in the bucket */
225 entries -= hash_count[i] - HASH_LIMIT;
226
227 entry = hash[i];
228 acc = 0;
229
230 /*
231 * Assume that this loop is gone through exactly
232 * HASH_LIMIT times and is entered and left with
233 * acc==0. So the first statement in the loop
234 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
235 * to the accumulator, and the inner loop consequently
236 * is run (hash_count[i]-HASH_LIMIT) times, removing
237 * one element from the list each time. Since acc
238 * balances out to 0 at the final run, the inner loop
239 * body can't be left with entry==NULL. So we indeed
240 * encounter entry==NULL in the outer loop only.
241 */
242 do {
243 acc += hash_count[i] - HASH_LIMIT;
244 if (acc > 0) {
245 struct unpacked_index_entry *keep = entry;
246 do {
247 entry = entry->next;
248 acc -= HASH_LIMIT;
249 } while (acc > 0);
250 keep->next = entry->next;
251 }
252 entry = entry->next;
253 } while (entry);
254 }
255 free(hash_count);
256
257 /*
258 * Now create the packed index in array form
259 * rather than linked lists.
260 */
261 memsize = sizeof(*index)
262 + sizeof(*packed_hash) * (hsize+1)
263 + sizeof(*packed_entry) * entries;
264 mem = malloc(memsize);
265 if (!mem) {
266 free(hash);
267 return NULL;
268 }
269
270 index = mem;
271 index->memsize = memsize;
272 index->src_buf = buf;
273 index->src_size = bufsize;
274 index->hash_mask = hmask;
275
276 mem = index->hash;
277 packed_hash = mem;
278 mem = packed_hash + (hsize+1);
279 packed_entry = mem;
280
281 for (i = 0; i < hsize; i++) {
282 /*
283 * Coalesce all entries belonging to one linked list
284 * into consecutive array entries.
285 */
286 packed_hash[i] = packed_entry;
287 for (entry = hash[i]; entry; entry = entry->next)
288 *packed_entry++ = entry->entry;
289 }
290
291 /* Sentinel value to indicate the length of the last hash bucket */
292 packed_hash[hsize] = packed_entry;
293
294 assert(packed_entry - (struct index_entry *)mem == entries);
295 free(hash);
296
297 return index;
298}
299
300void free_delta_index(struct delta_index *index)
301{
302 free(index);
303}
304
305unsigned long sizeof_delta_index(struct delta_index *index)
306{
307 if (index)
308 return index->memsize;
309 else
310 return 0;
311}
312
313/*
314 * The maximum size for any opcode sequence, including the initial header
315 * plus Rabin window plus biggest copy.
316 */
317#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
318
319void *
320create_delta(const struct delta_index *index,
321 const void *trg_buf, unsigned long trg_size,
322 unsigned long *delta_size, unsigned long max_size)
323{
324 unsigned int i, val;
325 off_t outpos, moff;
326 size_t l, outsize, msize;
327 int inscnt;
328 const unsigned char *ref_data, *ref_top, *data, *top;
329 unsigned char *out;
330
331 *delta_size = 0;
332
333 if (!trg_buf || !trg_size)
334 return NULL;
335
336 outpos = 0;
337 outsize = 8192;
338 if (max_size && outsize >= max_size)
339 outsize = max_size + MAX_OP_SIZE + 1;
340 out = malloc(outsize);
341 if (!out)
342 return NULL;
343
344 /* store reference buffer size */
345 l = index->src_size;
346 while (l >= 0x80) {
347 out[outpos++] = l | 0x80;
348 l >>= 7;
349 }
350 out[outpos++] = l;
351
352 /* store target buffer size */
353 l = trg_size;
354 while (l >= 0x80) {
355 out[outpos++] = l | 0x80;
356 l >>= 7;
357 }
358 out[outpos++] = l;
359
360 ref_data = index->src_buf;
361 ref_top = ref_data + index->src_size;
362 data = trg_buf;
363 top = (const unsigned char *) trg_buf + trg_size;
364
365 outpos++;
366 val = 0;
367 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
368 out[outpos++] = *data;
369 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
370 }
371 inscnt = i;
372
373 moff = 0;
374 msize = 0;
375 while (data < top) {
376 if (msize < 4096) {
377 struct index_entry *entry;
378 val ^= U[data[-RABIN_WINDOW]];
379 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
380 i = val & index->hash_mask;
381 for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
382 const unsigned char *ref = entry->ptr;
383 const unsigned char *src = data;
384 unsigned int ref_size = ref_top - ref;
385 if (entry->val != val)
386 continue;
387 if (ref_size > top - src)
388 ref_size = top - src;
389 if (ref_size <= msize)
390 break;
391 while (ref_size-- && *src++ == *ref)
392 ref++;
393 if (msize < ref - entry->ptr) {
394 /* this is our best match so far */
395 msize = ref - entry->ptr;
396 moff = entry->ptr - ref_data;
397 if (msize >= 4096) /* good enough */
398 break;
399 }
400 }
401 }
402
403 if (msize < 4) {
404 if (!inscnt)
405 outpos++;
406 out[outpos++] = *data++;
407 inscnt++;
408 if (inscnt == 0x7f) {
409 out[outpos - inscnt - 1] = inscnt;
410 inscnt = 0;
411 }
412 msize = 0;
413 } else {
414 unsigned int left;
415 unsigned char *op;
416
417 if (inscnt) {
418 while (moff && ref_data[moff-1] == data[-1]) {
419 /* we can match one byte back */
420 msize++;
421 moff--;
422 data--;
423 outpos--;
424 if (--inscnt)
425 continue;
426 outpos--; /* remove count slot */
427 inscnt--; /* make it -1 */
428 break;
429 }
430 out[outpos - inscnt - 1] = inscnt;
431 inscnt = 0;
432 }
433
434 /* A copy op is currently limited to 64KB (pack v2) */
435 left = (msize < 0x10000) ? 0 : (msize - 0x10000);
436 msize -= left;
437
438 op = out + outpos++;
439 i = 0x80;
440
441 if (moff & 0x000000ff) {
442 out[outpos++] = moff >> 0;
443 i |= 0x01;
444 }
445 if (moff & 0x0000ff00) {
446 out[outpos++] = moff >> 8;
447 i |= 0x02;
448 }
449 if (moff & 0x00ff0000) {
450 out[outpos++] = moff >> 16;
451 i |= 0x04;
452 }
453 if (moff & 0xff000000) {
454 out[outpos++] = moff >> 24;
455 i |= 0x08;
456 }
457
458 if (msize & 0x00ff) {
459 out[outpos++] = msize >> 0;
460 i |= 0x10;
461 }
462 if (msize & 0xff00) {
463 out[outpos++] = msize >> 8;
464 i |= 0x20;
465 }
466
467 *op = i;
468
469 data += msize;
470 moff += msize;
471 msize = left;
472
473 if (moff > 0xffffffff)
474 msize = 0;
475
476 if (msize < 4096) {
477 int j;
478 val = 0;
479 for (j = -RABIN_WINDOW; j < 0; j++)
480 val = ((val << 8) | data[j])
481 ^ T[val >> RABIN_SHIFT];
482 }
483 }
484
485 if (outpos >= outsize - MAX_OP_SIZE) {
486 void *tmp = out;
487 outsize = outsize * 3 / 2;
488 if (max_size && outsize >= max_size)
489 outsize = max_size + MAX_OP_SIZE + 1;
490 if (max_size && outpos > max_size)
491 break;
492 out = realloc(out, outsize);
493 if (!out) {
494 free(tmp);
495 return NULL;
496 }
497 }
498 }
499
500 if (inscnt)
501 out[outpos - inscnt - 1] = inscnt;
502
503 if (max_size && outpos > max_size) {
504 free(out);
505 return NULL;
506 }
507
508 *delta_size = outpos;
509 return out;
510}