deterministicGrouping.js 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. /*
  2. MIT License http://www.opensource.org/licenses/mit-license.php
  3. Author Tobias Koppers @sokra
  4. */
  5. "use strict";
  6. // Simulations show these probabilities for a single change
  7. // 93.1% that one group is invalidated
  8. // 4.8% that two groups are invalidated
  9. // 1.1% that 3 groups are invalidated
  10. // 0.1% that 4 or more groups are invalidated
  11. //
  12. // And these for removing/adding 10 lexically adjacent files
  13. // 64.5% that one group is invalidated
  14. // 24.8% that two groups are invalidated
  15. // 7.8% that 3 groups are invalidated
  16. // 2.7% that 4 or more groups are invalidated
  17. //
  18. // And these for removing/adding 3 random files
  19. // 0% that one group is invalidated
  20. // 3.7% that two groups are invalidated
  21. // 80.8% that 3 groups are invalidated
  22. // 12.3% that 4 groups are invalidated
  23. // 3.2% that 5 or more groups are invalidated
  24. /**
  25. * @param {string} a key
  26. * @param {string} b key
  27. * @returns {number} the similarity as number
  28. */
  29. const similarity = (a, b) => {
  30. const l = Math.min(a.length, b.length);
  31. let dist = 0;
  32. for (let i = 0; i < l; i++) {
  33. const ca = a.charCodeAt(i);
  34. const cb = b.charCodeAt(i);
  35. dist += Math.max(0, 10 - Math.abs(ca - cb));
  36. }
  37. return dist;
  38. };
  39. /**
  40. * @param {string} a key
  41. * @param {string} b key
  42. * @param {Set<string>} usedNames set of already used names
  43. * @returns {string} the common part and a single char for the difference
  44. */
  45. const getName = (a, b, usedNames) => {
  46. const l = Math.min(a.length, b.length);
  47. let i = 0;
  48. while (i < l) {
  49. if (a.charCodeAt(i) !== b.charCodeAt(i)) {
  50. i++;
  51. break;
  52. }
  53. i++;
  54. }
  55. while (i < l) {
  56. const name = a.slice(0, i);
  57. const lowerName = name.toLowerCase();
  58. if (!usedNames.has(lowerName)) {
  59. usedNames.add(lowerName);
  60. return name;
  61. }
  62. i++;
  63. }
  64. // names always contain a hash, so this is always unique
  65. // we don't need to check usedNames nor add it
  66. return a;
  67. };
  68. /**
  69. * @param {Record<string, number>} total total size
  70. * @param {Record<string, number>} size single size
  71. * @returns {void}
  72. */
  73. const addSizeTo = (total, size) => {
  74. for (const key of Object.keys(size)) {
  75. total[key] = (total[key] || 0) + size[key];
  76. }
  77. };
  78. /**
  79. * @param {Record<string, number>} total total size
  80. * @param {Record<string, number>} size single size
  81. * @returns {void}
  82. */
  83. const subtractSizeFrom = (total, size) => {
  84. for (const key of Object.keys(size)) {
  85. total[key] -= size[key];
  86. }
  87. };
  88. /**
  89. * @template T
  90. * @param {Iterable<Node<T>>} nodes some nodes
  91. * @returns {Record<string, number>} total size
  92. */
  93. const sumSize = nodes => {
  94. const sum = Object.create(null);
  95. for (const node of nodes) {
  96. addSizeTo(sum, node.size);
  97. }
  98. return sum;
  99. };
  100. /**
  101. * @param {Record<string, number>} size size
  102. * @param {Record<string, number>} maxSize minimum size
  103. * @returns {boolean} true, when size is too big
  104. */
  105. const isTooBig = (size, maxSize) => {
  106. for (const key of Object.keys(size)) {
  107. const s = size[key];
  108. if (s === 0) continue;
  109. const maxSizeValue = maxSize[key];
  110. if (typeof maxSizeValue === "number" && s > maxSizeValue) return true;
  111. }
  112. return false;
  113. };
  114. /**
  115. * @param {Record<string, number>} size size
  116. * @param {Record<string, number>} minSize minimum size
  117. * @returns {boolean} true, when size is too small
  118. */
  119. const isTooSmall = (size, minSize) => {
  120. for (const key of Object.keys(size)) {
  121. const s = size[key];
  122. if (s === 0) continue;
  123. const minSizeValue = minSize[key];
  124. if (typeof minSizeValue === "number" && s < minSizeValue) return true;
  125. }
  126. return false;
  127. };
  128. /**
  129. * @param {Record<string, number>} size size
  130. * @param {Record<string, number>} minSize minimum size
  131. * @returns {Set<string>} set of types that are too small
  132. */
  133. const getTooSmallTypes = (size, minSize) => {
  134. const types = new Set();
  135. for (const key of Object.keys(size)) {
  136. const s = size[key];
  137. if (s === 0) continue;
  138. const minSizeValue = minSize[key];
  139. if (typeof minSizeValue === "number" && s < minSizeValue) types.add(key);
  140. }
  141. return types;
  142. };
  143. /**
  144. * @template T
  145. * @param {TODO} size size
  146. * @param {Set<string>} types types
  147. * @returns {number} number of matching size types
  148. */
  149. const getNumberOfMatchingSizeTypes = (size, types) => {
  150. let i = 0;
  151. for (const key of Object.keys(size)) {
  152. if (size[key] !== 0 && types.has(key)) i++;
  153. }
  154. return i;
  155. };
  156. /**
  157. * @param {Record<string, number>} size size
  158. * @param {Set<string>} types types
  159. * @returns {number} selective size sum
  160. */
  161. const selectiveSizeSum = (size, types) => {
  162. let sum = 0;
  163. for (const key of Object.keys(size)) {
  164. if (size[key] !== 0 && types.has(key)) sum += size[key];
  165. }
  166. return sum;
  167. };
  168. /**
  169. * @template T
  170. */
  171. class Node {
  172. /**
  173. * @param {T} item item
  174. * @param {string} key key
  175. * @param {Record<string, number>} size size
  176. */
  177. constructor(item, key, size) {
  178. this.item = item;
  179. this.key = key;
  180. this.size = size;
  181. }
  182. }
  183. /**
  184. * @template T
  185. */
  186. class Group {
  187. /**
  188. * @param {Node<T>[]} nodes nodes
  189. * @param {number[] | null} similarities similarities between the nodes (length = nodes.length - 1)
  190. * @param {Record<string, number>=} size size of the group
  191. */
  192. constructor(nodes, similarities, size) {
  193. this.nodes = nodes;
  194. this.similarities = similarities;
  195. this.size = size || sumSize(nodes);
  196. /** @type {string | undefined} */
  197. this.key = undefined;
  198. }
  199. /**
  200. * @param {function(Node<T>): boolean} filter filter function
  201. * @returns {Node<T>[] | undefined} removed nodes
  202. */
  203. popNodes(filter) {
  204. const newNodes = [];
  205. const newSimilarities = [];
  206. const resultNodes = [];
  207. let lastNode;
  208. for (let i = 0; i < this.nodes.length; i++) {
  209. const node = this.nodes[i];
  210. if (filter(node)) {
  211. resultNodes.push(node);
  212. } else {
  213. if (newNodes.length > 0) {
  214. newSimilarities.push(
  215. lastNode === this.nodes[i - 1]
  216. ? /** @type {number[]} */ (this.similarities)[i - 1]
  217. : similarity(/** @type {Node<T>} */ (lastNode).key, node.key)
  218. );
  219. }
  220. newNodes.push(node);
  221. lastNode = node;
  222. }
  223. }
  224. if (resultNodes.length === this.nodes.length) return;
  225. this.nodes = newNodes;
  226. this.similarities = newSimilarities;
  227. this.size = sumSize(newNodes);
  228. return resultNodes;
  229. }
  230. }
  231. /**
  232. * @template T
  233. * @param {Iterable<Node<T>>} nodes nodes
  234. * @returns {number[]} similarities
  235. */
  236. const getSimilarities = nodes => {
  237. // calculate similarities between lexically adjacent nodes
  238. /** @type {number[]} */
  239. const similarities = [];
  240. let last;
  241. for (const node of nodes) {
  242. if (last !== undefined) {
  243. similarities.push(similarity(last.key, node.key));
  244. }
  245. last = node;
  246. }
  247. return similarities;
  248. };
  249. /**
  250. * @template T
  251. * @typedef {object} GroupedItems<T>
  252. * @property {string} key
  253. * @property {T[]} items
  254. * @property {Record<string, number>} size
  255. */
  256. /**
  257. * @template T
  258. * @typedef {object} Options
  259. * @property {Record<string, number>} maxSize maximum size of a group
  260. * @property {Record<string, number>} minSize minimum size of a group (preferred over maximum size)
  261. * @property {Iterable<T>} items a list of items
  262. * @property {function(T): Record<string, number>} getSize function to get size of an item
  263. * @property {function(T): string} getKey function to get the key of an item
  264. */
  265. /**
  266. * @template T
  267. * @param {Options<T>} options options object
  268. * @returns {GroupedItems<T>[]} grouped items
  269. */
  270. module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
  271. /** @type {Group<T>[]} */
  272. const result = [];
  273. const nodes = Array.from(
  274. items,
  275. item => new Node(item, getKey(item), getSize(item))
  276. );
  277. /** @type {Node<T>[]} */
  278. const initialNodes = [];
  279. // lexically ordering of keys
  280. nodes.sort((a, b) => {
  281. if (a.key < b.key) return -1;
  282. if (a.key > b.key) return 1;
  283. return 0;
  284. });
  285. // return nodes bigger than maxSize directly as group
  286. // But make sure that minSize is not violated
  287. for (const node of nodes) {
  288. if (isTooBig(node.size, maxSize) && !isTooSmall(node.size, minSize)) {
  289. result.push(new Group([node], []));
  290. } else {
  291. initialNodes.push(node);
  292. }
  293. }
  294. if (initialNodes.length > 0) {
  295. const initialGroup = new Group(initialNodes, getSimilarities(initialNodes));
  296. /**
  297. * @param {Group<T>} group group
  298. * @param {Record<string, number>} consideredSize size of the group to consider
  299. * @returns {boolean} true, if the group was modified
  300. */
  301. const removeProblematicNodes = (group, consideredSize = group.size) => {
  302. const problemTypes = getTooSmallTypes(consideredSize, minSize);
  303. if (problemTypes.size > 0) {
  304. // We hit an edge case where the working set is already smaller than minSize
  305. // We merge problematic nodes with the smallest result node to keep minSize intact
  306. const problemNodes = group.popNodes(
  307. n => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  308. );
  309. if (problemNodes === undefined) return false;
  310. // Only merge it with result nodes that have the problematic size type
  311. const possibleResultGroups = result.filter(
  312. n => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  313. );
  314. if (possibleResultGroups.length > 0) {
  315. const bestGroup = possibleResultGroups.reduce((min, group) => {
  316. const minMatches = getNumberOfMatchingSizeTypes(min, problemTypes);
  317. const groupMatches = getNumberOfMatchingSizeTypes(
  318. group,
  319. problemTypes
  320. );
  321. if (minMatches !== groupMatches)
  322. return minMatches < groupMatches ? group : min;
  323. if (
  324. selectiveSizeSum(min.size, problemTypes) >
  325. selectiveSizeSum(group.size, problemTypes)
  326. )
  327. return group;
  328. return min;
  329. });
  330. for (const node of problemNodes) bestGroup.nodes.push(node);
  331. bestGroup.nodes.sort((a, b) => {
  332. if (a.key < b.key) return -1;
  333. if (a.key > b.key) return 1;
  334. return 0;
  335. });
  336. } else {
  337. // There are no other nodes with the same size types
  338. // We create a new group and have to accept that it's smaller than minSize
  339. result.push(new Group(problemNodes, null));
  340. }
  341. return true;
  342. }
  343. return false;
  344. };
  345. if (initialGroup.nodes.length > 0) {
  346. const queue = [initialGroup];
  347. while (queue.length) {
  348. const group = /** @type {Group<T>} */ (queue.pop());
  349. // only groups bigger than maxSize need to be splitted
  350. if (!isTooBig(group.size, maxSize)) {
  351. result.push(group);
  352. continue;
  353. }
  354. // If the group is already too small
  355. // we try to work only with the unproblematic nodes
  356. if (removeProblematicNodes(group)) {
  357. // This changed something, so we try this group again
  358. queue.push(group);
  359. continue;
  360. }
  361. // find unsplittable area from left and right
  362. // going minSize from left and right
  363. // at least one node need to be included otherwise we get stuck
  364. let left = 1;
  365. const leftSize = Object.create(null);
  366. addSizeTo(leftSize, group.nodes[0].size);
  367. while (left < group.nodes.length && isTooSmall(leftSize, minSize)) {
  368. addSizeTo(leftSize, group.nodes[left].size);
  369. left++;
  370. }
  371. let right = group.nodes.length - 2;
  372. const rightSize = Object.create(null);
  373. addSizeTo(rightSize, group.nodes[group.nodes.length - 1].size);
  374. while (right >= 0 && isTooSmall(rightSize, minSize)) {
  375. addSizeTo(rightSize, group.nodes[right].size);
  376. right--;
  377. }
  378. // left v v right
  379. // [ O O O ] O O O [ O O O ]
  380. // ^^^^^^^^^ leftSize
  381. // rightSize ^^^^^^^^^
  382. // leftSize > minSize
  383. // rightSize > minSize
  384. // Perfect split: [ O O O ] [ O O O ]
  385. // right === left - 1
  386. if (left - 1 > right) {
  387. // We try to remove some problematic nodes to "fix" that
  388. let prevSize;
  389. if (right < group.nodes.length - left) {
  390. subtractSizeFrom(rightSize, group.nodes[right + 1].size);
  391. prevSize = rightSize;
  392. } else {
  393. subtractSizeFrom(leftSize, group.nodes[left - 1].size);
  394. prevSize = leftSize;
  395. }
  396. if (removeProblematicNodes(group, prevSize)) {
  397. // This changed something, so we try this group again
  398. queue.push(group);
  399. continue;
  400. }
  401. // can't split group while holding minSize
  402. // because minSize is preferred of maxSize we return
  403. // the problematic nodes as result here even while it's too big
  404. // To avoid this make sure maxSize > minSize * 3
  405. result.push(group);
  406. continue;
  407. }
  408. if (left <= right) {
  409. // when there is a area between left and right
  410. // we look for best split point
  411. // we split at the minimum similarity
  412. // here key space is separated the most
  413. // But we also need to make sure to not create too small groups
  414. let best = -1;
  415. let bestSimilarity = Infinity;
  416. let pos = left;
  417. const rightSize = sumSize(group.nodes.slice(pos));
  418. // pos v v right
  419. // [ O O O ] O O O [ O O O ]
  420. // ^^^^^^^^^ leftSize
  421. // rightSize ^^^^^^^^^^^^^^^
  422. while (pos <= right + 1) {
  423. const similarity = /** @type {number[]} */ (group.similarities)[
  424. pos - 1
  425. ];
  426. if (
  427. similarity < bestSimilarity &&
  428. !isTooSmall(leftSize, minSize) &&
  429. !isTooSmall(rightSize, minSize)
  430. ) {
  431. best = pos;
  432. bestSimilarity = similarity;
  433. }
  434. addSizeTo(leftSize, group.nodes[pos].size);
  435. subtractSizeFrom(rightSize, group.nodes[pos].size);
  436. pos++;
  437. }
  438. if (best < 0) {
  439. // This can't happen
  440. // but if that assumption is wrong
  441. // fallback to a big group
  442. result.push(group);
  443. continue;
  444. }
  445. left = best;
  446. right = best - 1;
  447. }
  448. // create two new groups for left and right area
  449. // and queue them up
  450. const rightNodes = [group.nodes[right + 1]];
  451. /** @type {number[]} */
  452. const rightSimilarities = [];
  453. for (let i = right + 2; i < group.nodes.length; i++) {
  454. rightSimilarities.push(
  455. /** @type {number[]} */ (group.similarities)[i - 1]
  456. );
  457. rightNodes.push(group.nodes[i]);
  458. }
  459. queue.push(new Group(rightNodes, rightSimilarities));
  460. const leftNodes = [group.nodes[0]];
  461. /** @type {number[]} */
  462. const leftSimilarities = [];
  463. for (let i = 1; i < left; i++) {
  464. leftSimilarities.push(
  465. /** @type {number[]} */ (group.similarities)[i - 1]
  466. );
  467. leftNodes.push(group.nodes[i]);
  468. }
  469. queue.push(new Group(leftNodes, leftSimilarities));
  470. }
  471. }
  472. }
  473. // lexically ordering
  474. result.sort((a, b) => {
  475. if (a.nodes[0].key < b.nodes[0].key) return -1;
  476. if (a.nodes[0].key > b.nodes[0].key) return 1;
  477. return 0;
  478. });
  479. // give every group a name
  480. const usedNames = new Set();
  481. for (let i = 0; i < result.length; i++) {
  482. const group = result[i];
  483. if (group.nodes.length === 1) {
  484. group.key = group.nodes[0].key;
  485. } else {
  486. const first = group.nodes[0];
  487. const last = group.nodes[group.nodes.length - 1];
  488. const name = getName(first.key, last.key, usedNames);
  489. group.key = name;
  490. }
  491. }
  492. // return the results
  493. return result.map(
  494. group =>
  495. /** @type {GroupedItems<T>} */
  496. ({
  497. key: group.key,
  498. items: group.nodes.map(node => node.item),
  499. size: group.size
  500. })
  501. );
  502. };