Saya mencoba untuk menonaktifkan JavaScript sehingga situs web tahu bahwa JavaScript dinonaktifkan pada tungkai pupeeter (yaitu: <noscript> Tags) di kelas dasar yang dibuat untuk merangkak situs web namun skrip saya gagal sehingga tidak menonaktifkan javascript ketika saya pergi situs web apa pun. Ini kode saya:

// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"

/**
 * Base class for all crawler
 */
abstract class BaseCrawler {
  public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
  public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
  public static readonly TORRC_PATH = process.env.TORRC_PATH;
  public static headless = false
  public readonly browser: Promise<puppeteer.Browser>;
  private readonly jsEnabled: boolean;

  /**
   * get the active page
   * @returns null if it couldn't get the active
   */
  public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
    const browser = await this.browser;
    var start = new Date().getTime();
    while (new Date().getTime() - start < timeout) {
      var pages = await browser.pages();
      var arr = [];
      for (const p of pages) {
        if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
          arr.push(p);
        }
      }

      if (arr.length == 1)
        return arr[0];
    }
    return null;
  }

  constructor(jsEnabled = false, website = "https://google.com") {
    console.log(Browser)
    
    this.browser = puppeteer.launch({
      headless: BaseCrawler.headless,
      //args: ["--proxy-server=socks5://127.0.0.1:9050"],
      userDataDir: "./.headless-data"
    });

    this.jsEnabled = jsEnabled;
    this.browser.then(async (b) => {
      b.on("targetcreated", async (e: Target) => {

        const page = await e.page();
        // set a tor useragent
        page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);

        // disable script if it's aksed
        if (page?.url()) {
          // console.log(page.url().search("chrome://"))
          if (page.url().search("chrome://") < 0)
            page?.setJavaScriptEnabled(this.jsEnabled)
        }

        page?.on('request', request => {
          if (request.resourceType() === 'script')
            request.abort();
          else
            request.continue();
        })
      })
    })

    this.browser.then(async (b) => {
      b.newPage()
      const page = await b.newPage();
      page.goto(website);
    })
  }
}
/** Bland tor window just made to browse tor */
export class TorWindow extends BaseCrawler {
};

Saya mencoba mengaitkan fungsi newPage namun tidak berfungsi karena memberi saya kesalahan berikut:

PS C:\Users\vince\project\js\crawler-project> yarn run browser
yarn run v1.22.10
$ node . --tor-window
undefined
C:\Users\vince\project\js\crawler-project\\dist\Crawler.js:35
        const old_newpage = puppeteer_1.default.Browser.prototype.newPage;
                                                        ^

TypeError: Cannot read property 'prototype' of undefined
    at new BaseCrawler (C:\Users\vince\project\js\crawler-project\dist\Crawler.js:35:57)
    at new TorWindow (C:\Users\vince\project\js\crawler-project\dist\Crawler.js:98:1)
    at Object.<anonymous> (C:\Users\vince\project\js\crawler-project\dist\index.js:27:5)
    at Module._compile (node:internal/modules/cjs/loader:1092:14)
    at Object.Module._extensions..js (node:internal/modules/cjs/loader:1121:10)
    at Module.load (node:internal/modules/cjs/loader:972:32)
    at Function.Module._load (node:internal/modules/cjs/loader:813:14)
    at Function.executeUserEntryPoint [as runMain] (node:internal/modules/run_main:76:12)
    at node:internal/main/run_main_module:17:47
error Command failed with exit code 1.
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"

/**
 * Base class for all crawler
 */
abstract class BaseCrawler {
  public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
  public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
  public static readonly TORRC_PATH = process.env.TORRC_PATH;
  public static headless = false
  public readonly browser: Promise<puppeteer.Browser>;
  private readonly jsEnabled: boolean;

  /**
   * get the active page
   * @returns null if it couldn't get the active
   */
  public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
    const browser = await this.browser;
    var start = new Date().getTime();
    while (new Date().getTime() - start < timeout) {
      var pages = await browser.pages();
      var arr = [];
      for (const p of pages) {
        if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
          arr.push(p);
        }
      }

      if (arr.length == 1)
        return arr[0];
    }
    return null;
  }

  constructor(jsEnabled = false, website = "https://check.torproject.org") {
    console.log(Browser)
    const old_newpage = puppeteer.Browser.prototype.newPage
    puppeteer.Browser.prototype.newPage = async () => {
      const page = await old_newpage()
      page.setJavaScriptEnabled(this.jsEnabled)
      return page;
    }
    this.browser = puppeteer.launch({
      headless: BaseCrawler.headless,
      args: ["--proxy-server=socks5://127.0.0.1:9050"],
      userDataDir: "./.headless-data"
    });

    this.jsEnabled = jsEnabled;
    this.browser.then(async (b) => {
      b.on("targetcreated", async (e: Target) => {

        const page = await e.page();
        // set a tor useragent
        page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);

        // disable script if it's aksed
        if (page?.url()) {
          // console.log(page.url().search("chrome://"))
          if (page.url().search("chrome://") < 0)
            page?.setJavaScriptEnabled(this.jsEnabled)
        }

        page?.on('request', request => {
          if (request.resourceType() === 'script')
            request.abort();
          else
            request.continue();
        })
      })
    })

    this.browser.then(async (b) => {
      b.newPage()
      const page = await b.newPage();
      page.goto(website);
    })
  }
}
/** Bland tor window just made to browse tor */
export class TorWindow extends BaseCrawler {
};
0
Vini Dalvino 4 April 2021, 06:04

2 jawaban

Jawaban Terbaik

Coba yang berikut ini:

page.setJavascriptEnabled(false)

Tetapkan ini sebelum menavigasi ke situs web.

Informasi lebih lanjut untuk dibaca tentang ini yang dapat Anda temukan di: halaman .SetJavascregTenabled. (diaktifkan)

1
Martin 4 April 2021, 17:03

Untuk menonaktifkan Javascript, kita perlu memonitor semua permintaan / tanggapan yang mengalir. Kemudian berdasarkan jenisnya, kita dapat memutuskan untuk mengakhiri permintaan / tanggapan.

Dalam contoh di bawah ini, kami akan memuat flipkart.com tanpa menggunakan file javascript.

const puppeteer = require('puppeteer');
(async() => {
const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', request => {
    if (request.resourceType() === 'script')
      request.abort();
    else
      request.continue();
  });
  await page.goto('https://www.flipkart.com');
  await browser.close();
  })();

Sumber: https://chercher.tech/puppreteer

Jika Anda tahu situs web itu akan menyajikan halaman tanpa JS untuk crawler tertentu, Anda dapat mencoba mengganti header.

1
undg 4 April 2021, 03:18