I'm having numerous issues getting chromedriver/Google Chrome to work on Ubuntu. While I've gotten it to work sporadically, it has never worked consistently. This is an EC2 instance running a Python script.
The Chrome version is 91.0.4472.114 and the chromedriver version is 91.0.4472.101.
However, when I type google-chrome in the terminal, I get:
[1799:1799:0620/033825.232832:ERROR:browser_main_loop.cc(1402)] Unable to open X display.
This happens even after installing Xvfb with: sudo apt-get install -y xvfb
Maybe that's causing my script to fail? Here's my script, a simple web scraper so far:
from bs4 import BeautifulSoup
import requests
import string
import json
import geocoder
import mapbox
import selenium
from selenium import webdriver
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import datetime
from datetime import datetime as dt
import re
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
# Scrape event links from the Bandsintown listing pages, then visit each event.
#
# Chrome flags must be added to the Options object BEFORE the driver is
# constructed: webdriver.Chrome() launches the browser immediately, so flags
# added afterwards are never passed to Chrome. Without --headless, Chrome
# tries to open an X display, which a display-less server does not have.
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--headless')

# Set up web driver and base URL
driver = webdriver.Chrome(options=chrome_options, executable_path='/usr/bin/chromedriver')

# Upper bound, in seconds, on each plain HTTP request so that one stalled
# server cannot block the whole run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Set base url (SAN FRANCISCO)
base_url = 'https://www.bandsintown.com/?place_id=ChIJIQBpAG2ahYAR_6128GcTUEo&page='  # san francisco
#base_url = 'https://www.bandsintown.com/?place_id=ChIJOwg_06VPwokRYv534QaPC8g&page='
events = []
eventContainerBucket = []
for i in range(1, 2):
    # cycle through pages in range
    pageURL = base_url + str(i)
    driver.get(pageURL)
    print(pageURL)
    # get events links
    event_list = driver.find_elements_by_css_selector('div[class^=_3buUBPWBhUz9KBQqgXm-gf] a[class^=_3UX9sLQPbNUbfbaigy35li]')
    # collect href attribute of events in event_list
    events.extend(link.get_attribute("href") for link in event_list)
    print("total events: ", (len(events)))

# iterate through all events and open them.
item = {}
allEvents = []
for event in events:
    driver.get(event)
    currentUrl = driver.current_url
    print(currentUrl)
    try:
        # A timeout makes a hung connection raise requests.exceptions.Timeout
        # (a RequestException subclass), which is handled below.
        currentRequest = requests.get(currentUrl, timeout=REQUEST_TIMEOUT_SECONDS)
        #print currentRequest.status_code
    except requests.exceptions.RequestException as e:
        # Skip events whose page cannot be fetched instead of aborting the run.
        print(e)
        continue
So, the script runs but hangs while printing the URLs, and I've gotten this error, which is of course unhelpful:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "BandsintownWebScraper.py", line 106, in <module>
currentUrl = driver.current_url
File "/home/ubuntu/.local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 669, in current_url
return self.execute(Command.GET_CURRENT_URL)['value']
File "/home/ubuntu/.local/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "/home/ubuntu/.local/lib/python3.6/site-packages/selenium/webdriver/remote/remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "/home/ubuntu/.local/lib/python3.6/site-packages/selenium/webdriver/remote/remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "/usr/lib/python3/dist-packages/urllib3/request.py", line 66, in request
**urlopen_kw)
File "/usr/lib/python3/dist-packages/urllib3/request.py", line 87, in request_encode_url
return self.urlopen(method, url, **extra_kw)
File "/usr/lib/python3/dist-packages/urllib3/poolmanager.py", line 322, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 601, in urlopen
chunked=chunked)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.6/http/client.py", line 1373, in getresponse
response.begin()
File "/usr/lib/python3.6/http/client.py", line 311, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.6/http/client.py", line 272, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
KeyboardInterrupt
The funny thing is, this script has worked in the past, but it fails to work consistently. It's failing when looping through the urls. Any help would be appreciated; I've spent so many hours trying to get this thing to work.
from Chrome on ubuntu - Unable to open X display and unknown error: Chrome failed to start: exited abnormally
No comments:
Post a Comment