Saya mencoba mengambil URL dari https://en.wikipedia.org/wiki/List_of_hedge_funds dengan menggunakan aktor Apify yang disebut "web-scraper" (https://apify.com/apify/web-scraper).

Secara khusus, saya mencoba menggunakan pageFunction Apify berikut untuk mengikis (scrape) halaman target tersebut dan mengembalikan daftar URL dari tag jangkar (anchor) yang ada di HTML-nya.

/**
 * Page function for the Apify web-scraper actor.
 *
 * Looks up the page title and every element matching `tr > td > a` through
 * the jQuery instance exposed on the context, then reports them together
 * with the URL of the current request.
 *
 * Note: `anchorTag` in the result is the object returned by the selector
 * call itself, not a list of href strings.
 *
 * @param {object} context - Object supplied by the scraper; this function
 *   reads `context.jQuery` and `context.request.url`.
 * @returns {Promise<{url: *, pageTitle: *, anchorTag: *}>} Record for the page.
 */
async function pageFunction(context) {
    const { jQuery: $, request } = context;

    // Never read below: the reported URL comes from the request object instead.
    const url = 'https://en.wikipedia.org/wiki/List_of_hedge_funds';

    const pageTitle = $('title').first().text();
    const anchorTag = $('tr > td > a');

    const record = { url: request.url };
    record.pageTitle = pageTitle;
    record.anchorTag = anchorTag;
    return record;
}

Di konsol saya, saya berharap untuk melihat nilai atribut href dari satu atau lebih tag jangkar yang ada di halaman target di properti yang disebut anchorTag. Saya juga berharap untuk melihat judul halaman di properti yang disebut pageTitle dan properti url. Sebagai berikut:

{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
    "1": "http://example1.com",
    "2": "http://example2.com",
    "3": "http://example3.com",
    ...
    "39": "http://example39.com",
}}

Namun alih-alih daftar URL, aktor mengembalikan kumpulan data berikut:

[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": {},
    "1": {},
    "2": {},
    "3": {},
    "4": {},
    "5": {},
    "6": {},
    "7": {},
    "8": {},
    "9": {},
    "10": {},
    "11": {},
    "12": {},
    "13": {},
    "14": {},
    "15": {},
    "16": {},
    "17": {},
    "18": {},
    "19": {},
    "20": {},
    "21": {},
    "22": {},
    "23": {},
    "24": {},
    "25": {},
    "26": {},
    "27": {},
    "28": {},
    "29": {},
    "30": {},
    "31": {},
    "32": {},
    "33": {},
    "34": {},
    "35": {},
    "36": {},
    "37": {},
    "38": {},
    "39": {},
    "length": 40,
    "prevObject": {
      "0": {
        "location": {
          "href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
          "ancestorOrigins": {},
          "origin": "https://en.wikipedia.org",
          "protocol": "https:",
          "host": "en.wikipedia.org",
          "hostname": "en.wikipedia.org",
          "port": "",
          "pathname": "/wiki/List_of_hedge_funds",
          "search": "",
          "hash": "",
          "assign": {},
          "reload": {},
          "toString": {},
          "replace": {}
        },
        "write": {},
        "writeln": {},
        "jQuery3410461525655351679551": {
          "events": {
            "mmv-setup-overlay": [
              {
                "type": "mmv-setup-overlay",
                "origType": "mmv-setup-overlay",
                "handler": {
                  "guid": 21
                },
                "guid": 21,
                "namespace": ""
              }
            ],
            "mmv-cleanup-overlay": [
              {
                "type": "mmv-cleanup-overlay",
                "origType": "mmv-cleanup-overlay",
                "handler": {
                  "guid": 22
                },
                "guid": 22,
                "namespace": ""
              }
            ],
            "keyup": [
              {
                "type": "keyup",
                "origType": "keyup",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseover": [
              {
                "type": "mouseover",
                "origType": "mouseover",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "focusout": [
              {
                "type": "focusout",
                "origType": "blur",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseout": [
              {
                "type": "mouseout",
                "origType": "mouseout",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "click": [
              {
                "type": "click",
                "origType": "click",
                "handler": {
                  "guid": 26
                },
                "guid": 26,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ]
          },
          "handle": {},
          "focusin": 1,
          "focusout": 1
        }
      },
      "length": 1
    }
  }
}]

Apa kesalahan yang saya lakukan?

1
NoUsername9 28 Februari 2020, 05:59

1 jawaban

Jawaban Terbaik

Anda harus mengakses atribut href dari tag a untuk mendapatkan URL. Selain itu, Anda perlu mengulang semua tag a untuk memasukkannya ke dalam satu larik.

// ...
const anchorTag = $( cssSelector );
const links = [];

// anchorTag in a JQuery handle, not a normal JavaScript value so it has special JQuery methods
anchorTag.each((index, el) => {
    const link = $(el).attr('href');
    if (link) {
         links.push(link);
    }
})

return {
   url: context.request.url,
   pageTitle,
   links,
};

2
Lukáš Křivka 28 Februari 2020, 14:30