/**
* Because many sites generate multiple URLs or URL variants pointing to the
* same resource, we standardize user-submitted URLs before storing or querying
* them. This reduces the need for manual merging.
*
* This module also can tag some known URLs according to predefined tags like
* 'shops' and 'reviews'.
*
* @namespace URLUtils
*/
'use strict';
// Node's built-in module helps a little
const url = require('url');
// Pattern behind validate(): a case-insensitive, fully anchored match for
// absolute http, https and ftp URLs. The host may be a dotted-quad IPv4 address
// or a dotted domain name; an optional "user@" prefix, a port, a path, a query
// string and a fragment are accepted, as are percent-escapes and the non-ASCII
// ranges listed in the character classes.
// NOTE(review): the pattern nests repeated groups inside repeated groups, so
// matching time on long, malformed input may be worth measuring -- TODO confirm.
const urlRegex = /^(https?|ftp):\/\/(((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!$&'()*+,;=]|:)*@)?(((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5]))|((([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.)+(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.?)(:\d*)?)(\/((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!$&'()*+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!$&'()*+,;=]|:|@)*)*)?)?(\?((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!$&'()*+,;=]|:|@)|[\uE000-\uF8FF]|\/|\?)*)?(#((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!$&'()*+,;=]|:|@)|\/|\?)*)*$/i;
// Each rule describes one known site:
//
// host: regular expression matched against the hostname part of the URL
// converter: (optional) conversion to be applied to a URL of this type before
//   it is added to the database. Can be a single function or an array of
//   multiple functions run in sequence.
// tags: internal descriptors that identify this URL, which can be used to group
//   related URLs. For applications that only use one tag, the first tag in the
//   array is chosen.
// id: identifier for all URLs of this type, which can be used to create a
//   default non-alphabetic presentation order for all known URLs of a certain
//   type
const rules = [
  {
    host: /^(www\.)?amazon\.com$/,
    converter: _stripAmazonQueryStrings,
    tags: ['shops', 'reviews'],
    id: 'amazon'
  },
  {
    host: /^(www\.)?wikidata\.org$/,
    tags: ['databases', 'opendata'],
    id: 'wikidata',
    converter: _stripFragment
  },
  {
    host: /^(www\.)?goodreads\.com$/,
    tags: ['reviews', 'databases'],
    id: 'goodreads'
  },
  {
    host: /^(www\.)?openstreetmap\.org$/,
    tags: ['maps', 'opendata', 'databases'],
    id: 'openstreetmap'
  },
  {
    host: /^openlibrary\.org$/,
    tags: ['databases', 'opendata'],
    id: 'openlibrary',
    converter: _stripOpenLibraryTitleSuffix
  },
  {
    host: /^(www\.)?imdb\.com$/,
    tags: ['databases', 'reviews'],
    id: 'imdb'
  },
  {
    host: /^(www\.)?yelp\.com$/,
    tags: ['reviews', 'databases'],
    id: 'yelp'
  },
  {
    host: /^(www\.)?tripadvisor\.com$/,
    tags: ['reviews', 'databases'],
    id: 'tripadvisor'
  },
  {
    host: /^(www\.)?indiebound\.org$/,
    tags: ['shops'],
    id: 'indiebound'
  },
  {
    // The bare domain plus any chain of subdomains (en., www., en.m.,
    // zh-min-nan., ...). Every subdomain label must end in a dot; without
    // that requirement "en.wikipedia.org" is rejected while lookalike hosts
    // such as "notwikipedia.org" are accepted.
    host: /^([a-z\d-]+\.)*wikipedia\.org$/,
    tags: ['summaries', 'databases', 'opendata'],
    id: 'wikipedia'
  },
  {
    host: /^store\.steampowered\.com$/,
    tags: ['shops', 'reviews'],
    id: 'steam'
  },
  {
    host: /^(.*\.)?itch\.io$/,
    tags: ['shops'],
    id: 'itch'
  },
  {
    host: /^(www\.)?gog\.com$/,
    tags: ['shops'],
    id: 'gog'
  },
];
// Presentation order of known source IDs, keyed by tag. Within each list,
// earlier entries are shown first. As a general policy, open data ranks ahead
// of proprietary data, and nonprofit platforms ahead of for-profit ones.
const placement = {
  databases: [
    'wikidata',
    'imdb'
  ],
  maps: [
    'openstreetmap'
  ],
  reviews: [
    'yelp',
    'tripadvisor',
    'goodreads'
  ],
  shops: [
    'indiebound',
    'itch',
    'gog',
    'steam',
    'amazon'
  ],
  summaries: [
    'wikipedia'
  ]
};
const urlUtils = {

  /**
   * Test a URL against the module's validation pattern.
   *
   * @param {String} inputURL
   *  the URL to check
   * @returns {Boolean}
   *  true if the pattern matches
   * @memberof URLUtils
   */
  validate(inputURL) {
    return urlRegex.test(inputURL);
  },

  /**
   * Apply all relevant converters. Since the URL is parsed via url.parse,
   * special characters are also urlencoded.
   *
   * @param {String} inputURL
   *  the URL to standardize
   * @returns {String}
   *  the re-serialized URL after every converter of every rule whose host
   *  pattern matches has been applied
   * @memberof URLUtils
   */
  normalize(inputURL) {
    const parsedURL = url.parse(inputURL);

    // Starting from the re-serialized href (rather than the raw input)
    // normalizes trailing slashes
    let outputURL = parsedURL.href;

    for (const rule of rules) {
      if (!rule.converter || !rule.host.test(parsedURL.hostname))
        continue;

      // A rule may carry a single converter or an array run in sequence
      const converters = Array.isArray(rule.converter) ?
        rule.converter :
        [rule.converter];

      for (const converter of converters)
        outputURL = converter(outputURL);
    }
    return outputURL;
  },

  /**
   * Transforms a URL array into a key value object as follows:
   *
   * {
   *   databases: [{ id: 'wikidata', url: 'https://www.wikidata.org/wiki/Q2611788' }],
   *   reviews: [{ id: 'yelp', url: 'https://www.yelp.com/biz/katzs-delicatessen-new-york' }],
   *   opendata: [{ id: 'wikidata', url: 'https://www.wikidata.org/wiki/Q2611788' }],
   *   other: [{ id: 'unknown', url: 'https://www.katzsdelicatessen.com/' }]
   * }
   *
   * URLs that match no rule are collected under `other` with the id
   * 'unknown'.
   *
   * @param {String[]} [inputURLs=[]]
   *  URLs to classify
   * @param {Object} [options]
   *  classification options
   * @param {Boolean} [options.onlyOneTag=false]
   *  if true, only the first tag of a matching rule is applied
   * @param {Boolean} [options.sortResults=false]
   *  if true, the entries of every tag that has a placement list are ordered
   *  according to that list
   * @returns {Object}
   *  arrays of `{ id, url }` objects, keyed by tag
   * @memberof URLUtils
   */
  getURLsByTag(inputURLs = [], options = { onlyOneTag: false, sortResults: false }) {
    const { onlyOneTag, sortResults } = options;
    const rv = {};

    for (const inputURL of inputURLs) {
      const { hostname } = url.parse(inputURL);
      let recognized = false;

      for (const rule of rules) {
        if (!rule.host.test(hostname) || !rule.tags || !rule.id)
          continue;

        for (const tag of rule.tags) {
          if (rv[tag] === undefined)
            rv[tag] = [];

          rv[tag].push({ id: rule.id, url: inputURL });
          recognized = true;

          if (onlyOneTag)
            break;
        }
      }

      if (!recognized) {
        if (rv.other === undefined)
          rv.other = [];

        rv.other.push({ id: 'unknown', url: inputURL });
      }
    }

    // Sort sources in each tag using the placement array, provided it's
    // configured for this tag. This runs once, after every URL has been
    // classified; sorting after each individual URL yields the same order but
    // repeats the work for every input.
    if (sortResults) {
      for (const tag of Object.keys(rv)) {
        const order = placement[tag];
        if (!Array.isArray(order))
          continue;

        // Sources missing from the placement array rank after all listed
        // ones. Ties compare as equal, so such entries stay in input order.
        const rank = id => {
          const position = order.indexOf(id);
          return position !== -1 ? position : order.length;
        };

        rv[tag].sort((obj1, obj2) => rank(obj1.id) - rank(obj2.id));
      }
    }

    return rv;
  },

  /**
   * Shorten a URL for display purposes.
   *
   * @param {String} inputURL
   *  the URL to shorten
   * @returns {String}
   *  the URL without its leading "scheme://" part and without one trailing
   *  slash
   * @memberof URLUtils
   */
  prettify(inputURL) {
    return inputURL
      .replace(/^.*?:\/\//, '') // strip protocol
      .replace(/\/$/, ''); // remove a single trailing slash
  }

};
/**
 * Open Library "work" and "edition" URLs contain a human-readable string at
 * the end, derived from the title. This string is optional, and it causes
 * problems matching URLs against each other, so we remove it.
 *
 * Only http(s) URLs on openlibrary.org whose path starts with /works/OL… or
 * /books/OL… are rewritten; the result always uses the https scheme. Any other
 * input is returned unchanged.
 *
 * @param {String} inputURL
 *  the Open Library URL
 * @returns {String}
 *  canonicalized URL
 * @memberof URLUtils
 * @protected
 */
function _stripOpenLibraryTitleSuffix(inputURL) {
  // Group 1 is the record type, group 2 the OL identifier (the path segment
  // up to the next slash). Whatever follows that segment is dropped. The
  // scheme is limited to http/https and the dot in the host name is escaped so
  // that only the literal host "openlibrary.org" is accepted.
  const match = inputURL.match(/^https?:\/\/openlibrary\.org\/(works|books)\/(OL[^/]+)(?:\/.*)?$/i);

  if (match === null)
    return inputURL;

  return `https://openlibrary.org/${match[1]}/${match[2]}`;
}
/**
 * Cut an Amazon URL off at its "ref=" path segment. Everything from the last
 * occurrence of "/ref=" onward is removed; the slash in front of "ref=" is
 * kept. URLs without such a segment are returned unchanged.
 *
 * @param {String} inputURL
 *  the Amazon URL
 * @returns {String}
 *  URL without the "ref=" segment and anything following it
 * @memberof URLUtils
 * @protected
 */
function _stripAmazonQueryStrings(inputURL) {
  const refMatch = inputURL.match(/(.*\/)ref=.*$/);

  // Group 1 holds everything up to and including the slash before "ref="
  return refMatch !== null && refMatch[1] ? refMatch[1] : inputURL;
}
/**
 * Generic converter that drops the fragment of a URL, i.e. the first "#" and
 * everything after it. Handy for sites where fragments are common and tend to
 * end up in the URL a user copies into the clipboard.
 *
 * @param {String} inputURL
 *  any URL
 * @returns {String}
 *  the part of the URL in front of the first "#"
 * @memberof URLUtils
 * @protected
 */
function _stripFragment(inputURL) {
  const [withoutFragment] = inputURL.split('#');
  return withoutFragment;
}
// The utilities object is the module's only export
module.exports = urlUtils;