Twitter made a fundamental change not so long ago: you cannot view a user's timeline any more if you are not logged in. That broke all my scraping code. The code bellow works on my developer environment, but not on the server. In my developer environment it's also headless. As you can see I tried to sprinkle in sleeps to figure out if the script is moving too fast.
The script fails by not finding the password input. That input comes into view after filling in the username / email input and pressing the Next button. After filling in the password I'd need to click the "Log in" button.
TWITTER_URL_BASE = "https://twitter.com/"
SELENIUM_INSTANCE_WAIT = 1
@classmethod
def _get_driver(cls):
driver = None
firefox_options = FirefoxOptions()
firefox_options.headless = True
firefox_options.add_argument("width=1920")
firefox_options.add_argument("height=1080")
firefox_options.add_argument("window-size=1920,1080")
firefox_options.add_argument("disable-gpu")
# https://stackoverflow.com/questions/24653127/selenium-error-no-display-specified
# export MOZ_HEADLESS=1
firefox_options.binary_location = "/usr/bin/firefox"
# firefox_options.set_preference("extensions.enabledScopes", 0)
# firefox_options.set_preference("gfx.webrender.all", False)
# firefox_options.set_preference("layers.acceleration.disabled", True)
firefox_binary = FirefoxBinary("/usr/bin/firefox")
firefox_profile = FirefoxProfile()
firefox_options.binary = "/usr/bin/firefox" # firefox_binary
firefox_options.profile = firefox_profile
capabilities = DesiredCapabilities.FIREFOX.copy()
capabilities["pageLoadStrategy"] = "normal"
firefox_options._caps = capabilities
try:
driver = webdriver.Firefox(
firefox_profile=firefox_profile,
firefox_binary=firefox_binary,
options=firefox_options,
desired_capabilities=capabilities,
)
except Exception as e:
cls.log_response("_get_driver", 500, "Crash: {}".format(e))
cls.log_response("_get_driver", 500, traceback.format_exc())
return driver
def _login_scraper_user(cls, driver, scraper_account):
driver.implicitly_wait(5)
driver.get(TWITTER_URL_BASE)
WebDriverWait(driver, 10).until(
lambda dr: dr.execute_script("return document.readyState") == "complete"
)
time.sleep(SELENIUM_INSTANCE_WAIT)
username_inputs = driver.find_elements_by_css_selector("input[name='text']")
if not username_inputs:
return False
username_input_parent = (
username_inputs[0].find_element_by_xpath("..").find_element_by_xpath("..")
)
username_input_parent.click()
time.sleep(SELENIUM_INSTANCE_WAIT)
username_inputs[0].click()
time.sleep(SELENIUM_INSTANCE_WAIT)
username_inputs[0].send_keys(scraper_account["username"])
time.sleep(SELENIUM_INSTANCE_WAIT)
next_buttons = driver.find_elements_by_xpath('//span[text()="Next"]')
if not next_buttons:
return False
next_buttons[0].click()
time.sleep(SELENIUM_INSTANCE_WAIT)
password_inputs = driver.find_elements_by_css_selector("input[name='password']")
if not password_inputs:
return False
password_input_parent = (
password_inputs[0].find_element_by_xpath("..").find_element_by_xpath("..")
)
password_input_parent.click()
time.sleep(SELENIUM_INSTANCE_WAIT)
password_inputs[0].click()
time.sleep(SELENIUM_INSTANCE_WAIT)
password_inputs[0].send_keys(scraper_account["password"])
time.sleep(SELENIUM_INSTANCE_WAIT)
login_buttons = driver.find_elements_by_xpath('//span[text()="Log in"]')
if not login_buttons:
return False
login_buttons[0].click()
time.sleep(SELENIUM_INSTANCE_WAIT)
if driver.find_elements_by_xpath(
'//span[text()="Boost your account security"]'
):
close_buttons = driver.find_elements_by_css_selector(
"div[data-testid='app-bar-close']"
)
if not close_buttons:
return False
close_buttons[0].click()
driver.implicitly_wait(0)
return True
This is an old version of Selenium because the server lags behind due to technical debt (it's an IaaS). I'm using the same ancient Selenium, however my Firefox is fresh.
Just a little follow-up: The whole API pricing -> scraping (I predicted this on the 1st of May) -> scrape prevention fight unnecessarily reached the user level: https://www.cnbc.com/2023/07/03/users-flock-to-twitter-competitor-bluesky-after-elon-musk-imposes-rate-limits.html Congratulations!