/**
* contains the loaded text from a file or a Wikipedia article
*/
var globalText;
/**
* contains the words of the globalText
*/
var globalTextArray;
/**
* contains a set of frequent English words which don't carry much information in a phrase-net
*/
var stopwords = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also",
"although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
"anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
"behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant",
"co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either",
"eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few",
"fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full",
"further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
"hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
"its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill",
"mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless",
"next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only",
"onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put",
"rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere",
"six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten",
"than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon",
"these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together",
"too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well",
"were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
"whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
"you", "your", "yours", "yourself", "yourselves", "the"];
/**
* generates a graph from the globalTextArray and the specified phrase from the main interface
*
* @param {string} reg_all A string containing the regular expression used to build the phrase net
*
* @returns A graph object containing nodes and links
*/
function parseText(reg_all)
{
//////////////////////////////////////////////////////////////////
console.log("-----------------------" + "\n" +
"| PERFORMANCE LOGGING |" + "\n" +
"-----------------------" + "\n" +
"t := length of text" + "\n" +
"n := number of displayed nodes/links" + "\n" +
"-----------------------");
var time_prev;
var time_curr;
time_prev = new Date().getTime();
//////////////////////////////////////////////////////////////////
/////////////// EXTRACT NODES AND LINKS ///////////////
// use hash maps (simple JavaScript objects) to store the nodes and links obtained from parsing the text
var simpleNodes = {};
var simpleLinks = {};
// check for regex and add nodes and links to has maps
while ((match = reg_all.exec(globalText)) != null) {
wordA = match[1].toLowerCase();
wordB = match[2].toLowerCase();
// check for first word/node
if (simpleNodes[wordA] == null) {
simpleNodes[wordA] = {name: wordA, inDegree: 0, outDegree: 1, count: 0}; // count will be determined later
} else {
simpleNodes[wordA].outDegree++;
}
// check for second word/node
if (simpleNodes[wordB] == null) {
simpleNodes[wordB] = {name: wordB, inDegree: 1, outDegree: 0, count: 0}; // count will be determined later
} else {
simpleNodes[wordB].inDegree++;
}
// check for link between first and second word/node
// use the concatenated words as the link's id (unique and can be used for fast hash-map look-up)
var link_id = wordA + "_" + wordB;
if (simpleLinks[link_id] == null) {
simpleLinks[link_id] = {source: simpleNodes[wordA], target: simpleNodes[wordB], count: 1};
} else {
simpleLinks[link_id].count++;
}
}
//////////////////////////////////////////////////////////////////
time_curr = new Date().getTime();
console.log("EXTRACT NODES AND LINKS:" + "\n" + (time_curr - time_prev) + "ms ~ O(t)");
time_prev = time_curr;
//////////////////////////////////////////////////////////////////
/////////////// FILTER NODES AND LINKS ///////////////
///// filter nodes (and links) by stop-words /////
// filter nodes by stop words
for (var key in simpleNodes) {
// skip loop if the property is from prototype
if(!simpleNodes.hasOwnProperty(key)) continue;
var node = simpleNodes[key];
// check if node is a stop word
if (stopwords.indexOf(node.name) != -1)
delete simpleNodes[key];
}
// filter links by stop words
for (var key in simpleLinks) {
// skip loop if the property is from prototype
if(!simpleLinks.hasOwnProperty(key)) continue;
var link = simpleLinks[key];
// check if node is a stop word
if (stopwords.indexOf(link.source.name) != -1 || stopwords.indexOf(link.target.name) != -1)
delete simpleLinks[key];
}
///// filter links by count /////
// get links-array
var simpleLinksArray = Object.keys(simpleLinks).map(function (key) {return simpleLinks[key];});
// sort links according to count
simpleLinksArray.sort(function(link1, link2) { return link2.count - link1.count; });
// only select the top links with highest count
var simpleLinksArrayFiltered = simpleLinksArray.slice(0, document.getElementById("input_maximumConnections").value);
///// filter nodes by selected links /////
// create map used for faster accessing of nodes (hash map vs. list) --> will be used for counting nodes in text
var simpleNodesFilteredMap = {};
simpleLinksArrayFiltered.forEach(function(link) {
if (simpleNodesFilteredMap[link.source.name] == null)
simpleNodesFilteredMap[link.source.name] = link.source;
if (simpleNodesFilteredMap[link.target.name] == null)
simpleNodesFilteredMap[link.target.name] = link.target;
});
// also save filtered nodes as array
simpleNodesArrayFiltered = Object.keys(simpleNodesFilteredMap).map(function (key) {return simpleNodesFilteredMap[key];});
//////////////////////////////////////////////////////////////////
time_curr = new Date().getTime();
console.log("FILTER NODES AND LINKS:" + "\n" + (time_curr - time_prev) + "ms ~ O(t)");
time_prev = time_curr;
//////////////////////////////////////////////////////////////////
/////////////// DETERMINE EACH WORD-COUNT ///////////////
for (var i = 0; i < globalTextArray.length; i++) {
var word = globalTextArray[i];
if (simpleNodesFilteredMap[word] != null)
simpleNodesFilteredMap[word].count++;
}
//////////////////////////////////////////////////////////////////
time_curr = new Date().getTime();
console.log("DETERMINE EACH WORD-COUNT:" + "\n" + (time_curr - time_prev) + "ms ~ O(t)");
time_prev = time_curr;
//////////////////////////////////////////////////////////////////
/////////////// MERGE NODES AND LINKS ACCORDING TO TOPOLOGY AND BRING THEM INTO STRUCTURES TO BE VISUALIZED BY THE MAPPER ///////////////
///// merge nodes /////
// create neighborhood map for filtered nodes
var neighborhood = {};
simpleLinksArrayFiltered.forEach(function(link) {
if (neighborhood[link.source.name] == null) {
neighborhood[link.source.name] = {inVertices: [], outVertices: [link.target.name]};
} else {
neighborhood[link.source.name].outVertices.push(link.target.name);
}
if (neighborhood[link.target.name] == null) {
neighborhood[link.target.name] = {inVertices: [link.source.name], outVertices: []};
} else {
neighborhood[link.target.name].inVertices.push(link.source.name);
}
});
var nodes_mergeStep1 = [];
// merge nodes which form a complete sub-graph (representation as a loop)
for (var i = 0; i < simpleNodesArrayFiltered.length; i++) {
var node_1 = simpleNodesArrayFiltered[i];
var node_1_NH = neighborhood[node_1.name];
// check if node has already been visited and ignore it if so
if (node_1.visited == true) continue;
var mergedNode = {id: "node_" + i, subnodes: [{name: node_1.name, inDegree: node_1.inDegree, outDegree: node_1.outDegree, count: node_1.count}]};
// change all links to point to the merged node instead of the old one
simpleLinksArrayFiltered.forEach(function(link) {
if (link.source.name == node_1.name) {
link.source = mergedNode;
}
if (link.target.name == node_1.name) {
link.target = mergedNode;
}
});
for (var j = i+1; j < simpleNodesArrayFiltered.length; j++) {
var node_2 = simpleNodesArrayFiltered[j];
var node_2_NH = neighborhood[node_2.name];
var checkForOccurence = function(neighborhood, nodeName) {
return neighborhood.inVertices.findIndex(function(name) {
return name == nodeName;
}) != -1 &&
neighborhood.outVertices.findIndex(function(name) {
return name == nodeName;
}) != -1;
}
// check if node_1 and node_2 might be in the same complete sub graph
if (checkForOccurence(node_1_NH, node_2.name) && checkForOccurence(node_2_NH, node_1.name)) {
// create copied neighborhood of node_2, where node_1 is removed and node_2 is added
var node_2_NH_compare = {};
node_2_NH_compare.inVertices = [node_2.name];
node_2_NH_compare.outVertices = [node_2.name];
for (var k = 0; k < node_2_NH.inVertices.length; k++) {
if (node_2_NH.inVertices[k] != node_1.name && node_2_NH_compare.inVertices.indexOf(node_2_NH.inVertices[k]) == -1) {
node_2_NH_compare.inVertices.push(node_2_NH.inVertices[k]);
}
}
for (var k = 0; k < node_2_NH.outVertices.length; k++) {
if (node_2_NH.outVertices[k] != node_1.name && node_2_NH_compare.outVertices.indexOf(node_2_NH.outVertices[k]) == -1) {
node_2_NH_compare.outVertices.push(node_2_NH.outVertices[k]);
}
}
if (compareNeighborhoods(node_1_NH, node_2_NH_compare) == true) {
// mark node as visited
node_2.visited = true;
// merge the nodes
mergedNode.subnodes.push({name: node_2.name, inDegree: node_2.inDegree, outDegree: node_2.outDegree, count: node_2.count});
// change all links to point to the merged node instead of the old one
simpleLinksArrayFiltered.forEach(function(link) {
if (link.source.name == node_2.name) {
link.source = mergedNode;
}
if (link.target.name == node_2.name) {
link.target = mergedNode;
}
});
}
}
}
nodes_mergeStep1.push(mergedNode);
}
// create neighborhood map for new merged nodes
var neighborhood_new = {};
simpleLinksArrayFiltered.forEach(function(link) {
if (neighborhood_new[link.source.id] == null) {
neighborhood_new[link.source.id] = {inVertices: [], outVertices: [link.target.id]};
} else {
neighborhood_new[link.source.id].outVertices.push(link.target.id);
}
if (neighborhood_new[link.target.id] == null) {
neighborhood_new[link.target.id] = {inVertices: [link.source.id], outVertices: []};
} else {
neighborhood_new[link.target.id].inVertices.push(link.source.id);
}
});
var nodes_mergeStep2 = [];
// merge nodes which have the same neighborhood
for (var i = 0; i < nodes_mergeStep1.length; i++) {
var node_1 = nodes_mergeStep1[i];
// check if node has already been visited and ignore it if so
if (node_1.visited == true) continue;
var mergedNode = {id: "node_" + i, subnodes: node_1.subnodes};
// change all links to point to the merged node instead of the old one
simpleLinksArrayFiltered.forEach(function(link) {
if (link.source.id == node_1.id) {
link.source = mergedNode;
}
if (link.target.id == node_1.id) {
link.target = mergedNode;
}
});
for (var j = i+1; j < nodes_mergeStep1.length; j++) {
var node_2 = nodes_mergeStep1[j];
// check for same neighborhoods (and check if the node has no self loop --> otherwise a complete graph where each node has a self loop would be merged)
if (compareNeighborhoods(neighborhood_new[node_1.id], neighborhood_new[node_2.id]) == true && neighborhood_new[node_1.id].inVertices.indexOf(node_1.id) == -1 ) {
// mark node as visited
node_2.visited = true;
// merge the nodes
mergedNode.subnodes = mergedNode.subnodes.concat(node_2.subnodes);
// change all links to point to the merged node instead of the old one
simpleLinksArrayFiltered.forEach(function(link) {
if (link.source.id == node_2.id) {
link.source = mergedNode;
}
if (link.target.id == node_2.id) {
link.target = mergedNode;
}
});
}
}
nodes_mergeStep2.push(mergedNode);
}
///// merge links /////
// remove all links which don't point to a merged node (shouldn't exist!!!)
// iterate backwards to avoid problems when removing links
for (var i = simpleLinksArrayFiltered.length - 1; i >= 0; i--) {
// check if there exists no id-value (only present in merged nodes)
if (simpleLinksArrayFiltered[i].source.id == null || simpleLinksArrayFiltered[i].target.id == null) {
console.log("The following link doesn't point to a merged node!");
console.log(simpleLinksArrayFiltered[i]);
// remove link
simpleLinksArrayFiltered.splice(i, 1);
}
}
var links = [];
// merge all links with same source and target
for (var i = 0; i < simpleLinksArrayFiltered.length; i++) {
var link_1 = simpleLinksArrayFiltered[i];
// check if link has already been visited and ignore it if so
if (link_1.visited == true) continue;
var mergedLink = {source: link_1.source, target: link_1.target, count: link_1.count};
var linkCounter = 1;
for (var j = i+1; j < simpleLinksArrayFiltered.length; j++) {
var link_2 = simpleLinksArrayFiltered[j];
// check for same source and target
if (link_1.source.id == link_2.source.id && link_1.target.id == link_2.target.id) {
// mark link as visited
link_2.visited = true;
// increase linkCounter
linkCounter++;
// update count of merged link
mergedLink.count += link_2.count
}
}
// calculate the average count of the merged links
mergedLink.count /= linkCounter;
links.push(mergedLink);
}
//////////////////////////////////////////////////////////////////
time_curr = new Date().getTime();
console.log("MERGE NODES AND LINKS ACCORDING TO TOPOLOGY:" + "\n" + (time_curr - time_prev) + "ms ~ O(n^2)");
time_prev = time_curr;
//////////////////////////////////////////////////////////////////
/////////////// CREATE GRAPH CONTAINER ///////////////
var graph = {"nodes" : nodes_mergeStep2, "links" : links};
return graph;
}
/**
* compares two neighborhoods and returns true if they contain the same in-node and out-nodes
*
* @param {object} n1 The first neighborhood
* @param {object} n2 The second neighborhood
*
* @returns true if n1 and n2 contain the same in-node and out-nodes
*/
function compareNeighborhoods(n1, n2) {
return (compareArrays(n1.inVertices, n2.inVertices) && compareArrays(n1.outVertices, n2.outVertices));
}
/**
* compares two arrays of strings and returns true if they contain the same elements
*
* @param {array} a The first array
* @param {array} b The second array
*
* @returns true if a and b contain the same elements
*/
function compareArrays(a, b) {
if (a.sort().join(',') == b.sort().join(',')) {
return true;
}
return false;
}