diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b0bbcfe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+venv/
+output.xml
diff --git a/LICENSE b/LICENSE
index 2071b23..aeccf11 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,9 +1,21 @@
 MIT License

-Copyright (c)
+Copyright (c) 2023 Líng Yì

-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:

-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index 9762576..ac6cda7 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,53 @@
-# Web2RDF
+# RDF Implementation Code

-Code for scraping e-commerce websites (Bukalapak, Tokopedia) & generating RDF data in XML format.
\ No newline at end of file
+This repository contains code for scraping product information from e-commerce websites (Bukalapak and Tokopedia) and generating RDF data based on the scraped information. The RDF data is saved in RDF/XML format.
+
+## Contents
+
+- `main.py`: The main Python script that performs the scraping and RDF generation.
+- `links.json`: A JSON file that contains the URLs of the product pages to be scraped from Bukalapak and Tokopedia.
+- `requirements.txt`: A file specifying the dependencies required to run the code.
+
+## Requirements
+
+To install the dependencies, use the following command:
+
+```
+pip install -r requirements.txt
+```
+
+The following dependencies are required:
+
+- `requests-html`: A library for making HTTP requests and parsing HTML responses.
+- `beautifulsoup4`: A library for parsing HTML and XML documents.
+- `rdflib`: A library for working with RDF (Resource Description Framework) data.
+
+## Usage
+
+To run the code, use the following command:
+
+```
+python main.py [-d {tokopedia,bukalapak}] [-s {tokopedia,bukalapak}]
+```
+
+- `-d, --debug {tokopedia,bukalapak}`: Enable debug mode for the specified scraper. This displays additional logging information, including the fetched HTML.
+- `-s, --source {tokopedia,bukalapak}`: Specify the data source for scraping. If not provided, the code scrapes both Bukalapak and Tokopedia.
+
+The code reads the URLs from the `links.json` file and scrapes the product information from the specified e-commerce websites. It then generates RDF data based on the scraped information and saves it in an XML file named `output.xml`. If the file already exists, a numbered suffix is added to the filename (e.g., `output_1.xml`, `output_2.xml`, etc.).
+
+## Implementation Details
+
+The code uses the following libraries and techniques:
+
+- `HTMLSession` from `requests-html` is used to make HTTP requests and retrieve the HTML content of the product pages.
+- `BeautifulSoup` from `beautifulsoup4` is used to parse the HTML and extract the desired information, such as product name, price, image, and specifications.
+- The `rdflib` library is used to create an RDF graph, define a custom namespace, and add RDF triples representing the scraped product data.
+- The RDF data is saved in XML format using the `serialize` method provided by `rdflib`.
+- The code handles URL encoding and decoding to ensure proper handling of special characters in the URLs.
+- Logging is used to report the progress of the scraping process and any errors that occur.
+
+Feel free to explore and modify the code to fit your requirements. A short example of consuming the generated output follows the License section.
+
+## License
+
+This code is licensed under the MIT License. See the [LICENSE](LICENSE) file for more information.
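+
+## Example: Inspecting the Output
+
+The generated `output.xml` can be loaded back with `rdflib` for quick inspection. A minimal sketch, not part of this repository; it assumes an `output.xml` produced by a previous run and uses the `http://example.org/` namespace bound in `main.py`:
+
+```
+from rdflib import Graph, Namespace, RDF
+
+EX = Namespace("http://example.org/")
+
+g = Graph()
+g.parse("output.xml", format="xml")
+
+# List every scraped product with its price
+for product in g.subjects(RDF.type, EX.Product):
+    print(g.value(product, EX.hasName), "-", g.value(product, EX.hasPrice))
+```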
diff --git a/links.json b/links.json
new file mode 100644
index 0000000..49f3425
--- /dev/null
+++ b/links.json
@@ -0,0 +1,22 @@
+{
+    "bukalapak": [
+        "https://www.bukalapak.com/p/handphone/tablet/ipad/4girvb1-jual-apple-ipad-9-9th-gen-2021-10-2-inch-64gb-256gb-64-256-wifi-only-bnib",
+        "https://www.bukalapak.com/p/handphone/tablet/ipad/4ha884s-jual-ipad-pro-2021-m1-chip-11-inch-128gb-256-512-1tb-2tb-silver-gray-wifi-cellular",
+        "https://www.bukalapak.com/p/handphone/hp-smartphone/iphone/4hlmbcs-jual-apple-iphone-12-64gb-garansi-resmi-ibox",
+        "https://www.bukalapak.com/p/handphone/hp-smartphone/iphone/4h2ro9d-jual-apple-iphone-13-128gb-garansi-resmi",
+        "https://www.bukalapak.com/p/handphone/hp-smartphone/iphone/4hlmbjm-jual-apple-iphone-14-128-256-512-garansi-resmi-ibox",
+        "https://www.bukalapak.com/p/komputer/laptop/48w9rmy-jual-macbook-pro-2020-13-inch-m1-8-core-cpu-8-core-gpu-8gb-512gb-resmi-ibox",
+        "https://www.bukalapak.com/p/komputer/laptop/laptop-ultrabook/4go90ls-jual-new-macbook-pro-2021-14-inch-m1-pro-10-8-cpu-16-14-gpu-16gb-512gb-1tb",
+        "https://www.bukalapak.com/p/komputer/laptop/laptop-ultrabook/4h9pprs-jual-new-macbook-pro-m2-chip-2022-13-inch-8-core-cpu-10-core-gpu-8gb-256gb"
+    ],
+    "tokopedia": [
+        "https://www.tokopedia.com/chocoapple/apple-ipad-9-9th-gen-2021-10-2-inch-64gb-256gb-64-256-wifi-only-ibox-64gb",
+        "https://www.tokopedia.com/fionflandshop/ipad-pro-2021-m1-chip-11-dan-12-9-128gb-256gb-wifi-cellular-11-256-gb",
+        "https://www.tokopedia.com/putragroup/ibox-ipad-pro-m1-2021-11-128gb-256gb-512gb-1tb-2tb-wifi-cell-inter-wifi-only-128gb-grey",
+        "https://www.tokopedia.com/bagindosonline/apple-iphone-12-128gb-garansi-resmi-ibox-tam-digimap-bnib-segel-biru-ce947",
+        "https://www.tokopedia.com/cmpphone/apple-iphone-13-128gb-grs-resmi-ibox-indonesia-midnight",
+        "https://www.tokopedia.com/chocoapple/macbook-pro-2020-13-inch-m1-8-core-cpu-8-core-gpu-8gb-512gb-resmi-ibox-silver",
+        "https://www.tokopedia.com/ljsofficial/apple-macbook-pro-14-2-inci-m1-pro-2021-8c-cpu-14c-gpu-512gb-ibox-grey-a11b1",
+        "https://www.tokopedia.com/riomorycell/macbook-pro-m2-2022-ram-8gb-ssd-512-touchbar-mnej3id-a-space-grey-ibox"
+    ]
+}
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..acefa18
--- /dev/null
+++ b/main.py
@@ -0,0 +1,160 @@
+import json
+import argparse
+import logging
+import os
+import urllib.parse
+from urllib.parse import quote
+
+import requests  # needed for requests.exceptions in the error handlers below
+from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+from rdflib import Graph, Literal, Namespace, RDF, URIRef
+
+def create_namespace():
+    NS = Namespace("http://example.org/")
+    g = Graph()
+    g.bind("ex", NS)
+    g.add((NS.Product, RDF.type, URIRef("http://www.w3.org/2000/01/rdf-schema#Class")))
+    g.add((NS.hasName, RDF.type, RDF.Property))
+    g.add((NS.hasPrice, RDF.type, RDF.Property))
+    g.add((NS.hasImage, RDF.type, RDF.Property))
+    g.add((NS.listedOn, RDF.type, RDF.Property))
+    g.add((NS.hasSourceURL, RDF.type, RDF.Property))
+    return NS, g
+
+def scrape_bukalapak(url, debug=False):
+    url = urllib.parse.unquote(url)  # Decode the URL
+    logging.debug("Scraping Bukalapak URL: %s", url)
+    try:
+        session = HTMLSession()
+        response = session.get(url, timeout=10)
+        logging.debug("Response status code: %d", response.status_code)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        if debug:
+            logging.debug("\n%s", soup.prettify())
+
+        name = soup.select_one("h1").text.strip()
+        price = soup.select_one("div.c-main-product__price div.c-product-price span").text.strip()
+
+        image_element = soup.select_one('div[data-testid="slider-items"] > picture > img')
+        if image_element:
+            image = image_element['src']
+        else:
+            image = ''
+
+        specs = {}
+        specs_iterator = soup.select("tr")
+        for spec_row in specs_iterator:
+            key = spec_row.select_one("th")
+            value = spec_row.select_one("td:last-child")
+            if key and value:
+                specs[key.text.strip()] = value.text.strip()
+
+        return name, price, image, specs
+    except requests.exceptions.RequestException as e:
+        logging.error("An error occurred while scraping Bukalapak: %s", str(e))
+        return None, None, None, None
+
+
+def scrape_tokopedia(url, debug=False):
+    url = urllib.parse.unquote(url)  # Decode the URL
+    logging.debug("Scraping Tokopedia URL: %s", url)
+    try:
+        session = HTMLSession()
+        response = session.get(url, timeout=10)
+        logging.debug("Response status code: %d", response.status_code)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        if debug:
+            logging.debug("\n%s", soup.prettify())
+
+        name = soup.select_one("h1").text.strip()
+        price = soup.select_one("div.price").text.strip()
+
+        image_element = soup.select_one('img[data-testid="PDPMainImage"]')
+        if image_element:
+            image = image_element['src']
+        else:
+            image = ''
+
+        specs = {}  # Specifications are currently not extracted for Tokopedia
+
+        return name, price, image, specs
+    except requests.exceptions.RequestException as e:
+        logging.error("An error occurred while scraping Tokopedia: %s", str(e))
+        return None, None, None, None
+
+
+def process_data(NS, g, scraper_name, urls, debug=False):
+    if scraper_name == 'bukalapak':
+        scrape_func = scrape_bukalapak
+    elif scraper_name == 'tokopedia':
+        scrape_func = scrape_tokopedia
+    else:
+        logging.error("Scraper '%s' is not supported.", scraper_name)
+        return
+
+    for i, url in enumerate(urls, start=1):
+        url = quote(url)  # Encode the URL; the scrape functions decode it again
+        name, price, image, specs = scrape_func(url, debug=debug)
+
+        if name is None or price is None:
+            logging.warning("Skipping URL %s. Failed to scrape product details.", url)
+            continue
+
+        # Replace invalid characters in the product name for the URI
+        product_name = name.replace(" ", "_").replace('"', '')
+
+        product = URIRef(NS + product_name)
+        g.add((product, RDF.type, NS.Product))
+        g.add((product, NS.hasName, Literal(name)))
+        g.add((product, NS.hasPrice, Literal(price)))
+        g.add((product, NS.listedOn, Literal(scraper_name.capitalize())))
+        g.add((product, NS.hasSourceURL, Literal(url)))
+
+        if image:
+            g.add((product, NS.hasImage, URIRef(image)))
+
+        if specs:
+            for key, value in specs.items():
+                g.add((product, URIRef(NS + key.replace(" ", "_")), Literal(value)))
+
+        logging.info("Scraped product %d/%d from %s: %s", i, len(urls), scraper_name, name)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--debug', choices=['tokopedia', 'bukalapak'], help='Enable debug mode for the specified scraper')
+    parser.add_argument('-s', '--source', choices=['tokopedia', 'bukalapak'], help='Specify the data source for scraping')
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO,
+                        format='%(asctime)s - %(levelname)s - %(message)s')
+
+    links_file = 'links.json'
+
+    with open(links_file) as file:
+        data = json.load(file)
+
+    NS, g = create_namespace()
+
+    if args.source:
+        if args.source in data:
+            process_data(NS, g, args.source, data[args.source], debug=args.debug == args.source)
+        else:
+            logging.error("Data source '%s' is not available in the links file.", args.source)
+    else:
+        for scraper_name, urls in data.items():
+            process_data(NS, g, scraper_name, urls, debug=args.debug == scraper_name)
+
+    output_file = 'output.xml'
+    count = 0
+    while os.path.exists(output_file):
+        count += 1
+        output_file = f'output_{count}.xml'  # first collision yields output_1.xml, as documented
+    rdf_xml = g.serialize(format="xml")
+    with open(output_file, 'w', encoding='utf-8') as file:
+        file.write(rdf_xml)
+    logging.info("RDF data saved to '%s'.", output_file)
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..bd1bba9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.12.2
+rdflib==6.3.2
+requests_html==0.10.0
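One caveat on the URI construction in `process_data`: `key.replace(" ", "_")` only handles spaces, so a specification key containing characters such as `/` or `(` can still produce a syntactically invalid URI. A possible hardening, sketched below; the `spec_predicate` helper is hypothetical and not part of `main.py`:

```
from urllib.parse import quote
from rdflib import URIRef

def spec_predicate(ns, key):
    # Hypothetical helper: percent-encode everything except underscores
    # so any spec key yields a syntactically valid URI.
    return URIRef(ns + quote(key.replace(" ", "_"), safe="_"))
```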