Sistema de Gestión Documental
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

500 lines
15 KiB

5 years ago
  1. 'use strict';
  2. var http = require('http');
  3. var https = require('https');
  4. var urllib = require('url');
  5. var utillib = require('util');
  6. var zlib = require('zlib');
  7. var dns = require('dns');
  8. var Stream = require('stream').Readable;
  9. var CookieJar = require('./cookiejar').CookieJar;
  10. var encodinglib = require('encoding');
  11. var net = require('net');
  12. var USE_ALLOC = typeof Buffer.alloc === 'function';
  13. exports.FetchStream = FetchStream;
  14. exports.CookieJar = CookieJar;
  15. exports.fetchUrl = fetchUrl;
  /**
   * Readable stream that requests `url` and exposes the response body as
   * stream data.
   *
   * The request itself is started asynchronously (see below) so that the
   * caller can attach listeners to the returned object first.
   *
   * @constructor
   * @param {String} url URL to fetch; an 'error' event is emitted right away when it is missing
   * @param {Object} [options] Request options; the same object is kept as `this.options`
   *        and normalized in place by `normalizeOptions()`
   */
  function FetchStream(url, options) {
    Stream.call(this);

    options = options || {};

    this.url = url;
    if (!this.url) {
      return this.emit('error', new Error('url not defined'));
    }

    this.userAgent = options.userAgent || 'FetchStream';
    this._redirect_count = 0;
    this.options = options;

    this.normalizeOptions();

    // prevent errors before 'error' handler is set by deferring actions
    var start = this.runStream.bind(this, url);
    if (typeof setImmediate === 'undefined') {
      process.nextTick(start);
    } else {
      setImmediate(start);
    }

    // bytes received from the response that have not been pushed downstream yet
    this.responseBuffer = USE_ALLOC ? Buffer.alloc(0, '', 'binary') : new Buffer(0, 'binary');
    // set once the response (or its decompressor) has finished or failed
    this.ended = false;
    // number of bytes the consumer has asked for through _read()
    this.readyToRead = 0;
  }
  utillib.inherits(FetchStream, Stream);
  /**
   * Readable#_read hook. Signals end-of-stream once the response has ended
   * and nothing is left in `responseBuffer`; otherwise records how many bytes
   * were requested and tries to hand buffered data over.
   *
   * @param {Number} size Number of bytes requested by the stream machinery
   */
  FetchStream.prototype._read = function (size) {
    var exhausted = this.ended && this.responseBuffer.length === 0;

    if (!exhausted) {
      this.readyToRead += size;
      this.drainBuffer();
      return;
    }

    this.push(null);
  };
  /**
   * Pushes buffered response bytes downstream, limited to the number of bytes
   * requested so far (`readyToRead`). Whatever does not fit stays in
   * `responseBuffer`. Does nothing when no bytes were requested or when the
   * buffer is empty.
   */
  FetchStream.prototype.drainBuffer = function () {
    var wanted = this.readyToRead;
    var pending = this.responseBuffer;
    var head;
    var tail;
    var tailSize;

    if (wanted === 0 || pending.length === 0) {
      return;
    }

    if (pending.length <= wanted) {
      // everything fits into the request, hand the buffer over as it is
      head = pending;
      tail = USE_ALLOC ? Buffer.alloc(0, '', 'binary') : new Buffer(0, 'binary');
    } else {
      // split into the part that is pushed now and the part that is kept
      head = USE_ALLOC ? Buffer.alloc(wanted, '', 'binary') : new Buffer(wanted, 'binary');
      pending.copy(head, 0, 0, wanted);
      tailSize = pending.length - wanted;
      tail = USE_ALLOC ? Buffer.alloc(tailSize, '', 'binary') : new Buffer(tailSize, 'binary');
      pending.copy(tail, 0, wanted);
    }

    this.responseBuffer = tail;
    this.readyToRead = 0;

    var encoding = this.options.encoding;
    if (encoding) {
      this.push(head, encoding);
    } else {
      this.push(head);
    }
  };
  /**
   * Replaces the inherited destroy() with one that only emits a 'destroy'
   * event carrying `ex`. `_runStream` listens for that event to abort the
   * request that is currently in flight.
   *
   * @param {*} [ex] Value passed along with the 'destroy' event
   * @returns {FetchStream} this, so calls can be chained like with the stock Readable#destroy()
   */
  FetchStream.prototype.destroy = function (ex) {
    this.emit('destroy', ex);
    return this;
  };
  /**
   * Fills in defaults on `this.options` (the object is modified in place) and
   * sets up `this.cookieJar`.
   *
   * Handled options: cookieJar, disableRedirects, maxRedirects, headers,
   * disableGzip, maxResponseLength, method, payload, payloadSize, cookies and
   * rejectUnauthorized.
   */
  FetchStream.prototype.normalizeOptions = function () {
    var options = this.options;
    var i;

    // cookiejar
    this.cookieJar = options.cookieJar || new CookieJar();

    // redirects: 0 when disabled, otherwise the given number or 10 by default.
    // `disableRedirect` is the misspelling the original check used; it is still
    // honoured so that objects carrying it keep getting no redirects.
    var hasMaxRedirects = typeof options.maxRedirects === 'number' ||
      options.maxRedirects instanceof Number;
    if (options.disableRedirects || options.disableRedirect) {
      options.maxRedirects = 0;
    } else if (!hasMaxRedirects) {
      options.maxRedirects = 10;
    }

    // normalize header keys
    // HTTP and HTTPS take key names case insensitively, but looking a value up
    // from an object needs the exact key, so all keys are lowercased.
    // Iterating backwards means that of two keys differing only by case the
    // one listed first ends up in the result.
    var givenHeaders = options.headers || {};
    var names = Object.keys(givenHeaders);
    var headers = {};
    for (i = names.length - 1; i >= 0; i--) {
      headers[names[i].toLowerCase().trim()] = givenHeaders[names[i]];
    }
    options.headers = headers;

    if (!headers['user-agent']) {
      headers['user-agent'] = this.userAgent;
    }

    if (!headers.pragma) {
      headers.pragma = 'no-cache';
    }

    if (!headers['cache-control']) {
      headers['cache-control'] = 'no-cache';
    }

    if (!options.disableGzip) {
      headers['accept-encoding'] = 'gzip, deflate';
    } else {
      delete headers['accept-encoding'];
    }

    // max length for the response,
    // if not set, default is Infinity
    if (!options.maxResponseLength) {
      options.maxResponseLength = Infinity;
    }

    // method:
    // defaults to GET, or when payload present to POST
    if (!options.method) {
      options.method = options.payload || options.payloadSize ? 'POST' : 'GET';
    }

    // set cookies
    // takes full cookie definition strings as params; a single string is
    // treated as a one element list instead of being walked char by char
    if (options.cookies) {
      var cookies = [].concat(options.cookies);
      for (i = 0; i < cookies.length; i++) {
        this.cookieJar.setCookie(cookies[i], this.url);
      }
    }

    // rejectUnauthorized
    if (typeof options.rejectUnauthorized === 'undefined') {
      options.rejectUnauthorized = true;
    }
  };
  /**
   * Turns a URL into the transport module and the options object used for
   * `transport.request()`.
   *
   * @param {String} url URL to request
   * @returns {Object} `{urloptions, transport}` where `transport` is the
   *          `https` module for "https:" URLs and `http` for everything else
   */
  FetchStream.prototype.parseUrl = function (url) {
    var urlparts = urllib.parse(url, false, true);
    var isHttps = urlparts.protocol === 'https:';
    var transport = isHttps ? https : http;
    var protocolAgent = isHttps ? 'agentHttps' : 'agentHttp';

    // Work on a copy of the configured headers: the Authorization header added
    // below belongs to this URL only and must not stick to `options.headers`,
    // otherwise it would be sent again to whatever host a redirect points at.
    var configured = this.options.headers || {};
    var names = Object.keys(configured);
    var headers = {};
    for (var i = 0; i < names.length; i++) {
      headers[names[i]] = configured[names[i]];
    }

    var urloptions = {
      host: urlparts.hostname || urlparts.host,
      port: urlparts.port || (isHttps ? 443 : 80),
      // pathname is null for URLs like "//host"; fall back to "/" before the
      // query string is appended so the path never becomes the text "null"
      path: (urlparts.pathname || '/') + (urlparts.search || ''),
      method: this.options.method,
      rejectUnauthorized: this.options.rejectUnauthorized
    };

    // protocol specific agent first, a generic `agent` option overrides it
    if (protocolAgent in this.options) {
      urloptions.agent = this.options[protocolAgent];
    }
    if ('agent' in this.options) {
      urloptions.agent = this.options.agent;
    }

    urloptions.headers = headers;

    if (urlparts.auth) {
      var buf = USE_ALLOC ? Buffer.alloc(Buffer.byteLength(urlparts.auth), urlparts.auth) : new Buffer(urlparts.auth);
      headers.Authorization = 'Basic ' + buf.toString('base64');
    }

    return {
      urloptions: urloptions,
      transport: transport
    };
  };
  /**
   * Replaces the inherited setEncoding(): the value is only stored in
   * `options.encoding`, which `drainBuffer` passes on to `push()`.
   *
   * @param {String} encoding Encoding name
   * @returns {FetchStream} this, so calls can be chained like with the stock Readable#setEncoding()
   */
  FetchStream.prototype.setEncoding = function (encoding) {
    this.options.encoding = encoding;
    return this;
  };
  /**
   * Prepares the request for `url` (cookies, content-length and, with the
   * `asyncDnsLoookup` option, a manual DNS lookup) and hands it over to
   * `_runStream`.
   *
   * With `asyncDnsLoookup` the connection is made to the resolved IPv4 address
   * while the original host name is sent in the Host header. A lookup failure
   * is reported through an 'error' event.
   *
   * @param {String} url URL to request
   */
  FetchStream.prototype.runStream = function (url) {
    var url_data = this.parseUrl(url);
    var requestOptions = url_data.urloptions;
    var headers = requestOptions.headers;
    var cookies = this.cookieJar.getCookies(url);

    if (cookies) {
      headers.cookie = cookies;
    } else {
      delete headers.cookie;
    }

    if (this.options.payload) {
      headers['content-length'] = Buffer.byteLength(this.options.payload || '', 'utf-8');
    }

    // an explicit payloadSize wins over the computed payload length
    if (this.options.payloadSize) {
      headers['content-length'] = this.options.payloadSize;
    }

    // the option name is spelled with three o's, keep it as callers know it
    if (!this.options.asyncDnsLoookup) {
      this._runStream(url_data, url);
      return;
    }

    var dnsCallback = (function (err, addresses) {
      if (err) {
        this.emit('error', err);
        return;
      }

      var name = requestOptions.hostname || requestOptions.host;
      var port = requestOptions.port;
      var defaultPort = url_data.transport === https ? 443 : 80;

      // Host header: IPv6 literals go in brackets and a port that is not the
      // default one for the protocol has to be part of the value
      var hostHeader = net.isIP(name) === 6 ? '[' + name + ']' : name;
      if (port && Number(port) !== defaultPort) {
        hostHeader += ':' + port;
      }

      headers.host = hostHeader;
      requestOptions.hostname = addresses[0];
      requestOptions.host = name + (port ? ':' + port : '');
      this._runStream(url_data, url);
    }).bind(this);

    if (net.isIP(requestOptions.host)) {
      // already an IP address, nothing to resolve
      dnsCallback(null, [requestOptions.host]);
    } else {
      dns.resolve4(requestOptions.host, dnsCallback);
    }
  };
  /**
   * Sends the request described by `url_data` and feeds the response into the
   * stream.
   *
   * - cookies from "set-cookie" response headers are stored in the cookie jar
   * - 301/302/303/307/308 responses with a location header are followed
   *   through `runStream` while redirects are allowed
   * - a 'meta' event with status, headers, final URL, redirect count and the
   *   cookie jar is emitted before any body data
   * - the body is cut off at `options.maxResponseLength` bytes
   * - gzip and deflate encoded bodies are decompressed
   *
   * @param {Object} url_data `{urloptions, transport}` as returned by `parseUrl`
   * @param {String} url URL being requested (base for relative redirects, reported as finalUrl)
   */
  FetchStream.prototype._runStream = function (url_data, url) {
    // listener that aborts this particular request when the stream is destroyed
    var abortRequest;

    var req = url_data.transport.request(url_data.urloptions, (function (res) {
      var i;

      // catch new cookies before potential redirect
      if (Array.isArray(res.headers['set-cookie'])) {
        for (i = 0; i < res.headers['set-cookie'].length; i++) {
          this.cookieJar.setCookie(res.headers['set-cookie'][i], url);
        }
      }

      if ([301, 302, 303, 307, 308].indexOf(res.statusCode) >= 0) {
        if (!this.options.disableRedirects && this.options.maxRedirects > this._redirect_count && res.headers.location) {
          this._redirect_count++;
          // The follow-up request registers its own 'destroy' listener. Drop
          // the one of this request, otherwise a listener is left behind for
          // every hop of a redirect chain.
          if (abortRequest) {
            this.removeListener('destroy', abortRequest);
          }
          req.destroy();
          this.runStream(urllib.resolve(url, res.headers.location));
          return;
        }
      }

      this.meta = {
        status: res.statusCode,
        responseHeaders: res.headers,
        finalUrl: url,
        redirectCount: this._redirect_count,
        cookieJar: this.cookieJar
      };

      // number of body bytes accepted so far
      var curlen = 0;
      // number of bytes that may still be taken from the current chunk
      var maxlen;

      var receive = (function (chunk) {
        if (curlen + chunk.length > this.options.maxResponseLength) {
          maxlen = this.options.maxResponseLength - curlen;
        } else {
          maxlen = chunk.length;
        }

        // limit reached, ignore the rest of the body
        if (maxlen <= 0) {
          return;
        }

        curlen += Math.min(maxlen, chunk.length);

        if (maxlen >= chunk.length) {
          if (this.responseBuffer.length === 0) {
            this.responseBuffer = chunk;
          } else {
            this.responseBuffer = Buffer.concat([this.responseBuffer, chunk]);
          }
        } else {
          // the total length argument truncates the chunk to `maxlen` bytes
          this.responseBuffer = Buffer.concat([this.responseBuffer, chunk], this.responseBuffer.length + maxlen);
        }

        this.drainBuffer();
      }).bind(this);

      var error = (function (e) {
        this.ended = true;
        this.emit('error', e);
        this.drainBuffer();
      }).bind(this);

      var end = (function () {
        this.ended = true;
        // with data still buffered, _read() signals the end once it is drained
        if (this.responseBuffer.length === 0) {
          this.push(null);
        }
      }).bind(this);

      // pipes the response through zlib.create<type>() before receiving it
      var unpack = (function (type, res) {
        var z = zlib['create' + type]();
        z.on('data', receive);
        z.on('error', error);
        z.on('end', end);
        res.pipe(z);
      }).bind(this);

      this.emit('meta', this.meta);

      if (res.headers['content-encoding']) {
        switch (res.headers['content-encoding'].toLowerCase().trim()) {
          case 'gzip':
            return unpack('Gunzip', res);
          case 'deflate':
            // NOTE(review): decoded as a raw deflate stream; bodies that carry
            // a zlib header presumably end up in the zlib 'error' handler -
            // confirm against the servers this is used with
            return unpack('InflateRaw', res);
        }
      }

      res.on('data', receive);
      res.on('end', end);
    }).bind(this));

    req.on('error', (function (e) {
      this.emit('error', e);
    }).bind(this));

    if (this.options.timeout) {
      req.setTimeout(this.options.timeout, req.abort.bind(req));
    }

    abortRequest = req.abort.bind(req);
    this.on('destroy', abortRequest);

    if (this.options.payload) {
      req.end(this.options.payload);
    } else if (this.options.payloadStream) {
      this.options.payloadStream.pipe(req);
      this.options.payloadStream.resume();
    } else {
      req.end();
    }
  };
  /**
   * Fetches `url` and passes the complete response body to `callback`.
   *
   * Unless `options.disableDecoding` is set, a body whose charset is not UTF-8
   * is converted to UTF-8. The charset is taken from `options.overrideCharset`,
   * from a <meta> tag (text/html only) or from the content-type header, in
   * that order, with "utf-8" as the last resort.
   *
   * @param {String} url URL to fetch
   * @param {Object} [options] FetchStream options plus `overrideCharset`,
   *        `disableDecoding` and `outputEncoding`
   * @param {Function} callback Called once as `callback(error)` or
   *        `callback(null, meta, body)`; `body` is a Buffer, or a string when
   *        `options.outputEncoding` is set
   */
  function fetchUrl(url, options, callback) {
    if (!callback && typeof options === 'function') {
      callback = options;
      options = undefined;
    }
    options = options || {};

    var fetchstream = new FetchStream(url, options);
    var chunks = [];
    var length = 0;
    var response_data;
    // stays empty if the stream ends without having emitted 'meta'
    var content_type = {};
    var callbackFired = false;

    fetchstream.on('meta', function (meta) {
      response_data = meta;
      content_type = _parseContentType(meta.responseHeaders['content-type']);
    });

    fetchstream.on('data', function (chunk) {
      if (chunk) {
        chunks.push(chunk);
        length += chunk.length;
      }
    });

    fetchstream.on('error', function (error) {
      if (error && error.code === 'HPE_INVALID_CONSTANT') {
        // skip invalid formatting errors
        return;
      }
      if (callbackFired) {
        return;
      }
      callbackFired = true;
      callback(error);
    });

    fetchstream.on('end', function () {
      if (callbackFired) {
        return;
      }
      callbackFired = true;

      var body;
      var charset;

      // Decoding can throw (unknown charset, bad output encoding). Report that
      // through the callback; thrown from inside this event handler it would
      // never reach the caller.
      try {
        body = Buffer.concat(chunks, length);

        charset = content_type.charset;
        if (content_type.mimeType === 'text/html') {
          charset = _findHTMLCharset(body) || charset;
        }
        charset = (options.overrideCharset || charset || 'utf-8').trim().toLowerCase();
        content_type.charset = charset;

        if (!options.disableDecoding && !charset.match(/^utf-?8$/i)) {
          body = encodinglib.convert(body, 'UTF-8', charset);
        }

        if (options.outputEncoding) {
          body = body.toString(options.outputEncoding);
        }
      } catch (E) {
        return callback(E);
      }

      return callback(null, response_data, body);
    });
  }
  /**
   * Splits a content-type header value into mime type and charset.
   *
   * @param {String} str Header value, e.g. `text/html; charset="ISO-8859-1"`
   * @returns {Object} `{mimeType, charset}`, both trimmed and lowercased, with
   *          charset defaulting to "utf-8"; an empty object when `str` is empty
   */
  function _parseContentType(str) {
    if (!str) {
      return {};
    }

    var params = str.split(';');
    var mimeType = params.shift();
    var charset;
    var pair;

    for (var i = 0, len = params.length; i < len; i++) {
      pair = params[i].split('=');
      if (pair.length > 1 && pair[0].trim().toLowerCase() === 'charset') {
        // the value may be sent as a quoted string, keep only what is inside
        charset = pair[1].trim().replace(/^["']|["']$/g, '');
      }
    }

    return {
      mimeType: (mimeType || '').trim().toLowerCase(),
      charset: (charset || 'UTF-8').trim().toLowerCase() // defaults to UTF-8
    };
  }
  /**
   * Looks for a charset declaration in the <meta> tags of an HTML document.
   *
   * Checked in this order:
   *   1. <meta http-equiv="content-type" content="...; charset=NAME">
   *   2. <meta charset="NAME">
   *
   * @param {Buffer} htmlbuffer Raw HTML document
   * @returns {String|undefined} Lowercased charset name, or undefined when nothing is declared
   */
  function _findHTMLCharset(htmlbuffer) {
    var body = htmlbuffer.toString('ascii');
    var charset;
    var found;

    // http-equiv form: the attribute may come in any position inside the tag
    var tag = body.match(/<meta\s[^>]*?http-equiv\s*=\s*["']?content-type["']?[^>]*>/i);
    if (tag) {
      // charset names may contain "_", "." and ":" besides letters, digits and
      // dashes (e.g. Shift_JIS), and the keyword itself is case insensitive
      found = tag[0].match(/charset\s*=\s*["']?\s*([\w.:\-]+)/i);
      if (found) {
        charset = found[1].trim().toLowerCase();
      }
    }

    // short form, with or without quotes around the value
    if (!charset) {
      found = body.match(/<meta\s+charset\s*=\s*["']?\s*([\w.:\-]+)/i);
      if (found) {
        charset = found[1].trim().toLowerCase();
      }
    }

    return charset;
  }