/**
* @brief Module for creating a suffix tree out of an input text and
* getting important features from it for creating an arc diagram
*
* @file suffixtree.cpp
* @author David Pfahler
* @author Matthias Gusenbauer
*
* @long Module for creating a suffix tree out of an input text and
* getting important features from it for creating an arc diagram
*
* suffix tree adapted but enhanced from:
* Copyright (C) 2012 Eike Send
*
* http://eike.se/nd
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/** Node object constructor for the suffixtree construction
* @constructor
*/
Node = function(){
this.value = ""; /** Node string value */
this.leaves = []; /** Leaves(suffixes) of the node */
this.nodes = []; /** Childnodes of a node */
this.leavePositions = []; /** Startpositions of the leaves in the string */
this.nodePositions = []; /** Startpositions of the childnodes */
};
/** Arc object constructor for maximum matchin pair generation
* @constructor
* @param sourcePos First starting position of the arc
* @param numberEle Length of the arc in characters
* @param targetPos Second starting position of the arc
* @param text String value of the arc
* @return Instance of an Arc object
*/
Arc = function(sourcePos,numberEle,targetPos,text) {
this.sourcePos = sourcePos; /** First starting position of the arc */
this.numberEle = numberEle; /** Length of the arc in characters */
this.targetPos = targetPos; /** Second starting position of the arc */
this.text = text; /** String value of the arc */
};
/** Pair object constructor for pairs in the tree
* @constructor
* @param str the value of pair
* @param pos the starting positions in text
*/
Pair = function(str, pos){
this.pattern = new String(str); /** value of pair */
this.positions = pos; /** starting positions in text */
this.length = str.length; /** length of the text */
};
/** Method to update the position of the pattern of a pair with a new letter
* @param letter the letter that gets concated to the pattern
*/
Pair.prototype.update = function(letter){
this.pattern = this.pattern.concat(letter);
this.length = this.pattern.length;
// meant for multiple pairs per node
for (var i = 0; i < this.positions.length; i++) {
this.positions[i]--;
}
};
/** Method to test for node splitting in Suffixtree generation
* @param suf Current suffix in the tree generation
* @param pos Position of the current suffx in the complete string
* @return boolean whether to split a node or not
*/
Node.prototype.checkNodes = function(suf, pos) {
var node;
for (var i = 0; i < this.nodes.length; i++) {
node = this.nodes[i];
if (node.value == suf[0]) { // if first letter in current node and suffix matches
node.addSuffix(suf.slice(1), pos+1); // split and take new suffix - begin again - MAGIC: node creation here - save position of string
node.nodePositions.push(pos);
return true; // has updated
}
}
return false;
};
/** Method to test leaves for matching beginning and node insertion
* @param suf Current suffix in the tree generation
* @param pos Position of the current suffx in the complete string
* @return none
*/
Node.prototype.checkLeaves = function(suf, pos) {
var node, leaf;
for (var i = 0; i < this.leaves.length; i++) {
leaf = this.leaves[i];
if (leaf[0] == suf[0]) { // if first letter matches leave
node = new Node(); // create new (inner) node - MAGIC: node creation here - save position of string
node.value = leaf[0];
node.addSuffix(suf.slice(1), pos+1); // add suffix of string
var oldPos = this.leavePositions.splice(i, 1)[0];
node.addSuffix(leaf.slice(1), oldPos+1); // add suffix of previous leave
//node.nodePositions.push(pos-1);
node.nodePositions.push(oldPos);
node.nodePositions.push(pos);
this.nodes.push(node); // add new node to current node
this.leaves.splice(i,1);
//this.nodePositions.push(this.leavePositions.splice(i, 1));
return;
}
}
this.leaves.push(suf); // if no suffix match - add new leave node to current node
this.leavePositions.push(pos);
};
/** Method to add a suffix to the current tree
* @param suf Current suffix in the tree generation
* @param pos Position of the current suffx in the complete string
* @return none
*/
Node.prototype.addSuffix = function(suf, pos) {
if (!suf.length) return; // last suffix
if (!this.checkNodes(suf, pos)) { // inner: check all nodes if contains suffix
this.checkLeaves(suf, pos); // outer: if no update in nodes -> check all leavees if contains suffix
}
};
/** Method to retrieve the longest repeated substring from the suffixtree
* @return The longest of the repeated substrings in the suffixtree
*/
Node.prototype.getLongestRepeatedSubString = function() {
var str = "";
var temp = "";
for (var i = 0; i < this.nodes.length; i++) {
temp = this.nodes[i].getLongestRepeatedSubString();
if (temp.length > str.length) {
str = temp;
}
}
return this.value + str;
};
/** Method to retrieve the longest repeated substrings from the suffixtree
* @return All pairs of repeated substrings
*/
Node.prototype.getLongestRepeatedSubStrings = function() {
var pairs = [];
var temp = [];
// Add pair for each position of node
/*for (var i = 0; i < this.nodePositions.length; i++) {
pairs.push(new Pair(this.value, this.nodePositions));
}*/
if (this.nodePositions.length > 0) {
pairs.push(new Pair(this.value, this.nodePositions));
}
// Iterate over subnodes
for (var i = 0; i < this.nodes.length; i++) {
temp = this.nodes[i].getLongestRepeatedSubStrings();
for (var j = 0; j < temp.length; j++) {
if (this.value.length !== 0) { //Not only length check but also check for Definition 1.
temp[j].update(this.value);
}
pairs.push(temp[j]);
}
}
return pairs;
};
/** Method to create an HTML representation of the suffix tree
* @return HTML representation of the suffixtree
*/
Node.prototype.toHTML = function() {
var html = "<div class=node>";
if (this.value.length) {
html += "<h3>"+this.value+"</h3>";
}
if (this.nodes.length) {
html += "<h4>nodes</h4><ul>";
for (var i = 0; i < this.nodes.length; i++) {
html += "<li>" + this.nodes[i].toHTML() + "</li>";
}
html += "</ul>";
}
if (this.leaves.length) {
html += "<h4>leaves</h4><ul>";
for (var i = 0; i < this.leaves.length; i++) {
html += "<li>" + this.leaves[i] + "</li>";
}
html += "</ul>";
}
return html;
};
/** Method to create an HTML representation of the suffix tree
* @constructor
* @return Suffixtree object
*/
SuffixTree = function(str) {
this.node = new Node(); /** Root node of suffixtree */
for (var i = 0; i < str.length; i++) {
this.node.addSuffix(str.slice(i), i);
}
};
/** Method to retrieve all repetitionregions of a tree according to definition 2. from the paper
* @param input The input text string to parse for repetition regions
* @return All repetition regions in the suffixtree
*/
SuffixTree.prototype.getRepRegion = function(input) {
var regions = [];
var n = input.length;
var s = 0;
while (s < n-1)
{
for (var l = 1; l < (n-s)/2+1;l++)
{
var candidate = input.substring(s,s+l);
var end = s+l;
while (input.substring(end,end+l).indexOf(candidate) >= 0)
end += l;
// not enough pairs
if(end === s + l)
continue;
// not left most
var bOk = false;
tuple = [s-1,end-1,candidate.length]
for(var i = 0; i < regions.length && !bOk; i++)
{
var r = regions[i];
if(r[0] === t[0] && r[1] === t[1] && r[2].length === t[2])
bOk = true;
}
if(bOk)
continue;
// not minimal
bOk = false;
for(var i = 0; i < regions.length && !bOk; i++)
{
r = regions[i];
if(r[0] <= s && r[1] >= end && r[2].length <= candidate.length)
bOk = true;
}
if(bOk)
continue;
// perfekt
r = [s,end,candidate];
regions.push(r);
}
s++;
}
return regions;
}
/** Method to retrieve all essential matching pairs of a tree according to definition 3. from the paper
* @param mmPairs The maximal matching pairs of the tree
* @param regions The repetition regions of the tree
* @return All essential matching pairs from the suffix tree
*/
SuffixTree.prototype.getEssentialMatchinPairs = function(mmPairs, regions) {
var emPairs = [];
for (var i = 0; i < mmPairs.length; i++) {
var a = mmPairs[i];
// definition 3.1
// A maximal matching pair not contained in any repetition region
bOk = true;
for (var j = 0; j < regions.length && bOk; j++) {
r = regions[j];
if ((a.sourcePos >= r[0]) &&
(a.targetPos + a.numberEle <= r[1]) ||
// definition 3.2
// Or, a maximal matching pair contained in the same fundamental substring of any repetition region that contains it
(Math.floor((a.sourcePos - r[0])/r[2].length) == Math.floor((a.targetPos + a.numberEle - r[0] - 1)/r[2].length)))
bOk = false;
}
if(bOk)
emPairs.push(a);
}
// definition 3.3
// Or, two consecutive fundamental substrings for a repetition region.
for (var i = regions.length - 1; i >= 0; i--) {
var r = regions[i];
var text = r[2];
var step = text.length;
stop = r[1] - step;
for (var j=r[0]; j < stop; j += step){
emPairs.push(new Arc(j,step,j+step,text));
}
}
return emPairs;
}
/** Method to retrieve all maximal matching pairs of a tree according to definition 1. from the paper
* @return All maximal matching pairs from the tree
*/
SuffixTree.prototype.getMaximalMatchingPairs = function() {
// Definition 1.1 Identical.X and Y consist of the same sequence of symbols.
var pairs = this.node.getLongestRepeatedSubStrings();
// sort the pairs by their string length
pairs.sort(function(a,b){
if( a.length < b.length) return 1;
else if (a.length > b.length) return -1;
else if (a.positions.length < b.positions.length) return 1;
else if (a.positions.length > b.positions.length) return -1;
else return 0; }
);
// and the positions by their occurences
for (i = 0; i < pairs.length; i++) {
pairs[i].positions.sort(function(a,b){return (a - b)});
}
// Definition 1.2 Non-overlapping X and Y do not intersect.
for (i = 0; i < pairs.length; i++) {
var pair = pairs[i];
// eliminate all overlapping within a pattern
for(s = 0; s < pair.positions.length; s++) {
// the next position is not allowed to be within the length of the pattern (overlapping)
for(t = s+1; t < pair.positions.length; t++) {
if(pair.positions[s] + pair.length > pair.positions[t]) {
pair.positions.splice(t--,1);
}
}
}
if(pair.positions.length <= 1)
pairs.splice(i--,1);
}
// create the arcs
var arcs = [];
for (i = 0; i < pairs.length; i++) {
var pair = pairs[i];
for(j = 0; j < pair.positions.length - 1; j++) {
// reverse the text of the pattern (because of internal suffix tree representation)
var text = pair.pattern.split("").reverse().join("")
arcs.push(new Arc(pair.positions[j], pair.length, pair.positions[j+1], text));
}
}
// iterate over the filtered arcs and remove the ones that are not:
// * identical - it is impossible that they are identical here
// * Consecutive
// * Maximal
for(i = 0; i < arcs.length; i++)
{
var arc = arcs[i];
for(j = 0; j < arcs.length; j++)
{
if( i === j )
continue;
var other = arcs[j];
// Not maximal test:
// the start position of this arc is within of an other
// and the end position of this arc is not within this other
if ((arc.sourcePos >= other.sourcePos) &&
(arc.sourcePos + arc.numberEle) <= (other.sourcePos + other.numberEle) &&
(arc.targetPos >= (other.sourcePos + other.numberEle))) {
arcs.splice(i--,1);
break;
}
}
}
return arcs;
};