From cdd7281f346d218a44e08a303fc860050c557305 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Sat, 7 Mar 2026 13:35:31 -0500 Subject: [PATCH 1/4] Add Firebase authentication support - Integrated Firebase for user login and signup using Firebase ID tokens. - Updated login and signup mutations to handle Firebase authentication. - Modified user schema to reflect changes in authentication method. - Updated .gitignore to include Firebase service account key. --- .gitignore | 3 ++- app.py | 21 +++++++++++++++++++++ requirements.txt | 1 + src/mutations/login_user.py | 15 +++++++++++---- src/mutations/signup_user.py | 24 ++++++++++++++++-------- src/schema.py | 6 ++++-- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 7e14a5b..76f90e5 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ __pycache__/ .env .envrc .DS_Store -ca-certificate.crt \ No newline at end of file +ca-certificate.crt +firebase-service-account-key.json \ No newline at end of file diff --git a/app.py b/app.py index 9720900..a1fa4df 100644 --- a/app.py +++ b/app.py @@ -23,6 +23,27 @@ from src.utils.team_loader import TeamLoader from src.database import db +import os +import firebase_admin +from firebase_admin import credentials, auth + +SERVICE_ACCOUNT_PATH = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + + +def initialize_firebase(): + if not firebase_admin._apps: + if not SERVICE_ACCOUNT_PATH: + raise ValueError( + "GOOGLE_APPLICATION_CREDENTIALS is not set. Set it to your firebase-service-account-key.json path." 
+ ) + cred = credentials.Certificate(SERVICE_ACCOUNT_PATH) + firebase_admin.initialize_app(cred) + logging.info("Firebase app initialized.") + return firebase_admin.get_app() + + +initialize_firebase() + app = Flask(__name__) # CORS: allow frontend (different origin) to call this API diff --git a/requirements.txt b/requirements.txt index 6ef629f..fb220c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ Flask-APScheduler python-dotenv pytz gunicorn +firebase-admin \ No newline at end of file diff --git a/src/mutations/login_user.py b/src/mutations/login_user.py index b606738..b3bece6 100644 --- a/src/mutations/login_user.py +++ b/src/mutations/login_user.py @@ -1,19 +1,26 @@ from graphql import GraphQLError -from graphene import Mutation, String, Field +from graphene import Mutation, String +from firebase_admin import auth as firebase_auth from flask_jwt_extended import create_access_token, create_refresh_token from src.database import db class LoginUser(Mutation): class Arguments: - net_id = String(required=True, description="User's net ID (e.g. 
Cornell netid).") + id_token = String(required=True, description="Firebase ID token from the client.") access_token = String() refresh_token = String() - def mutate(self, info, net_id): - user = db["users"].find_one({"net_id": net_id}) + def mutate(self, info, id_token): + try: + decoded = firebase_auth.verify_id_token(id_token) + except Exception: + raise GraphQLError("Invalid or expired token.") + + firebase_uid = decoded["uid"] + user = db["users"].find_one({"firebase_uid": firebase_uid}) if not user: raise GraphQLError("User not found.") identity = str(user["_id"]) diff --git a/src/mutations/signup_user.py b/src/mutations/signup_user.py index eb6f6ae..4ff60b6 100644 --- a/src/mutations/signup_user.py +++ b/src/mutations/signup_user.py @@ -1,30 +1,38 @@ from graphql import GraphQLError from graphene import Mutation, String +from firebase_admin import auth as firebase_auth from flask_jwt_extended import create_access_token, create_refresh_token from src.database import db class SignupUser(Mutation): class Arguments: - net_id = String(required=True, description="User's net ID (e.g. 
Cornell netid).") + id_token = String(required=True, description="Firebase ID token from the client.") name = String(required=False, description="Display name.") - email = String(required=False, description="Email address.") + email = String(required=False, description="Email (overrides token email if provided).") access_token = String() refresh_token = String() - def mutate(self, info, net_id, name=None, email=None): - if db["users"].find_one({"net_id": net_id}): - raise GraphQLError("Net ID already exists.") + def mutate(self, info, id_token, name=None, email=None): + try: + decoded = firebase_auth.verify_id_token(id_token) + except Exception: + raise GraphQLError("Invalid or expired token.") + + firebase_uid = decoded["uid"] + if db["users"].find_one({"firebase_uid": firebase_uid}): + raise GraphQLError("User already exists.") + + email = email or decoded.get("email") user_doc = { - "net_id": net_id, + "firebase_uid": firebase_uid, + "email": email, "favorite_game_ids": [], } if name is not None: user_doc["name"] = name - if email is not None: - user_doc["email"] = email result = db["users"].insert_one(user_doc) identity = str(result.inserted_id) return SignupUser( diff --git a/src/schema.py b/src/schema.py index 70b5473..3bda2d0 100644 --- a/src/schema.py +++ b/src/schema.py @@ -31,9 +31,11 @@ class Mutation(ObjectType): create_team = CreateTeam.Field(description="Creates a new team.") create_youtube_video = CreateYoutubeVideo.Field(description="Creates a new youtube video.") create_article = CreateArticle.Field(description="Creates a new article.") - login_user = LoginUser.Field(description="Login by net_id; returns access_token and refresh_token.") + login_user = LoginUser.Field( + description="Login with Firebase ID token; returns access_token and refresh_token.", + ) signup_user = SignupUser.Field( - description="Create a new user by net_id; returns access_token and refresh_token (no separate login needed).", + description="Create a new user with Firebase 
ID token; returns access_token and refresh_token.", ) refresh_access_token = RefreshAccessToken.Field( description="Exchange a valid refresh token (in Authorization header) for a new access_token.", From f19c92ece34a0dfdda94db959cd1eece248f9108 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Wed, 25 Mar 2026 17:52:24 -0400 Subject: [PATCH 2/4] Add softball summary extraction and enhance database indexing - Introduced a new function to extract scoring summaries for softball games, including inning and score details. - Updated the database index setup to ensure uniqueness of game entries by including the time field. - Added an 'inning' field to the BoxScoreEntryType for better data representation. --- src/database.py | 3 +++ src/scrapers/game_details_scrape.py | 32 +++++++++++++++++++++++++++-- src/scrapers/games_scraper.py | 13 ++++++------ src/types.py | 1 + 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/database.py b/src/database.py index 20f1331..25ccd39 100644 --- a/src/database.py +++ b/src/database.py @@ -70,6 +70,7 @@ def setup_database_indexes(): game_collection.create_index([("date", -1)], background=True) try: + # Ensure doubleheaders on the same day remain distinct by including `time`. 
game_collection.create_index( [ ("sport", 1), @@ -79,8 +80,10 @@ def setup_database_indexes(): ("city", 1), ("state", 1), ("location", 1), + ("time", 1), ], unique=True, + name="uniq_game_key_with_time", background=True ) except (DuplicateKeyError, OperationFailure) as e: diff --git a/src/scrapers/game_details_scrape.py b/src/scrapers/game_details_scrape.py index 5f2f3b1..d8c6a93 100644 --- a/src/scrapers/game_details_scrape.py +++ b/src/scrapers/game_details_scrape.py @@ -53,6 +53,34 @@ def extract_teams_and_scores(box_score_section, sport): return team_names, period_scores +def softball_summary(box_score_section): + summary = [] + scoring_section = box_score_section.find(TAG_SECTION, {ATTR_ARIA_LABEL: LABEL_SCORING_SUMMARY}) + if scoring_section: + scoring_rows = scoring_section.find(TAG_TBODY) + if scoring_rows: + for row in scoring_rows.find_all(TAG_TR): + team = row.find_all(TAG_TD)[0].find(TAG_IMG)[ATTR_ALT] + inning = row.find_all(TAG_TD)[3].text.strip() + desc_cell = row.find_all(TAG_TD)[4] + span = desc_cell.find(TAG_SPAN) + if span: + span.extract() + desc = desc_cell.get_text(strip=True) + cornell_score = int(row.find_all(TAG_TD)[5].get_text(strip=True) or 0) + opp_score = int(row.find_all(TAG_TD)[6].get_text(strip=True) or 0) + summary.append({ + 'team': team, + 'inning': inning, + 'description': desc, + 'cor_score': cornell_score, + 'opp_score': opp_score + }) + if not summary: + summary = [{"message": "No scoring events in this game."}] + return summary + + def soccer_summary(box_score_section): summary = [] scoring_section = box_score_section.find(TAG_SECTION, {ATTR_ARIA_LABEL: LABEL_SCORING_SUMMARY}) @@ -124,14 +152,13 @@ def hockey_summary(box_score_section): scorer = row.find_all(TAG_TD)[4].text.strip() assist = row.find_all(TAG_TD)[5].text.strip() - if team == "COR" or team == "CU" or team == "Cornell": + if team == "COR" or team == "CU" or team == "Cornell" or team == "CORNELL": cornell_score += 1 else: opp_score += 1 summary.append({ 'team': 
team, - 'period': period, 'time': time, 'scorer': scorer, 'assist': assist, @@ -272,6 +299,7 @@ def scrape_game(url, sport): 'field hockey': (lambda: extract_teams_and_scores(box_score_section, 'field hockey'), field_hockey_summary), 'lacrosse': (lambda: extract_teams_and_scores(box_score_section, 'lacrosse'), lacrosse_summary), 'baseball': (lambda: extract_teams_and_scores(box_score_section, 'baseball'), baseball_summary), + 'softball': (lambda: extract_teams_and_scores(box_score_section, 'softball'), softball_summary), 'basketball': (lambda: extract_teams_and_scores(box_score_section, 'basketball'), lambda _: []), } diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index 818760c..547df44 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -125,7 +125,8 @@ def parse_schedule_page(url, sport, gender): result_tag = game_item.select_one(RESULT_TAG) if result_tag: - game_data["result"] = result_tag.text.strip().replace("\n", "") + #game_data["result"] = result_tag.get_text(" ", strip=True) + game_data["result"] = result_tag.text.strip().replace("\n", " ") else: game_data["result"] = None @@ -241,17 +242,16 @@ def process_game_data(game_data): if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final): game_data["score_breakdown"] = game_data["score_breakdown"][::-1] - # Try to find by tournament key fields to handle placeholder teams + # Try to find an existing game record to update. 
curr_game = GameService.get_game_by_tournament_key_fields( city, game_data["date"], game_data["gender"], location, game_data["sport"], - state + state, ) - - # If no tournament game found, try the regular lookup with opponent_id + if not curr_game: curr_game = GameService.get_game_by_key_fields( city, @@ -260,7 +260,7 @@ def process_game_data(game_data): location, team.id, game_data["sport"], - state + state, ) if isinstance(curr_game, list): @@ -268,6 +268,7 @@ def process_game_data(game_data): curr_game = curr_game[0] else: curr_game = None + if curr_game: updates = { "time": game_time, diff --git a/src/types.py b/src/types.py index 7eb8fbe..cef15c2 100644 --- a/src/types.py +++ b/src/types.py @@ -54,6 +54,7 @@ class BoxScoreEntryType(ObjectType): team = String(required=False) period = String(required=False) + inning = String(required=False) time = String(required=False) description = String(required=False) scorer = String(required=False) From f2c5a569ec08c708f3995b84955869409eacdcd6 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Wed, 15 Apr 2026 14:54:35 -0400 Subject: [PATCH 3/4] Enhance Firebase integration, update environment configuration, fix softball scraper with coderabbit feedback - Added new environment variables for Google and Firebase credentials in .env_template. - Updated Docker configuration to mount Firebase service account key. - Improved error handling in login and signup mutations for better token validation. - Enhanced database indexing for users to ensure unique Firebase UID entries. 
- Improved softball scraper logic --- .env_template | 4 ++- app.py | 12 +------- docker-compose.yml | 3 ++ requirements.txt | 2 +- src/database.py | 7 +++++ src/mutations/login_user.py | 16 ++++++++-- src/mutations/signup_user.py | 24 +++++++++++---- src/repositories/game_repository.py | 46 +++++++++++++++++++---------- src/scrapers/game_details_scrape.py | 4 ++- src/scrapers/games_scraper.py | 6 ++-- src/services/game_service.py | 10 +++---- src/types.py | 3 +- 12 files changed, 90 insertions(+), 47 deletions(-) diff --git a/.env_template b/.env_template index 56b5add..7c5f29d 100644 --- a/.env_template +++ b/.env_template @@ -2,4 +2,6 @@ YOUTUBE_API_KEY= MONGO_URI= MONGO_DB= STAGE= -DAILY_SUN_URL= \ No newline at end of file +DAILY_SUN_URL= +GOOGLE_APPLICATION_CREDENTIALS= +FIREBASE_CREDENTIALS_HOST_PATH=./firebase-service-account-key.json \ No newline at end of file diff --git a/app.py b/app.py index a1fa4df..2e113b4 100644 --- a/app.py +++ b/app.py @@ -25,7 +25,7 @@ import os import firebase_admin -from firebase_admin import credentials, auth +from firebase_admin import credentials SERVICE_ACCOUNT_PATH = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") @@ -164,16 +164,6 @@ def signal_handler(sig, frame): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) -# Only parse arguments when running directly (not when imported by gunicorn) -if __name__ == "__main__": - args = parse_args() -else: - # Default args when imported by gunicorn - class DefaultArgs: - no_scrape = False - no_daily_sun = False - args = DefaultArgs() - # Only run scraping tasks if not disabled if not args.no_scrape: from flask_apscheduler import APScheduler diff --git a/docker-compose.yml b/docker-compose.yml index 339a5f2..ce81025 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,10 +4,13 @@ services: app: image: cornellappdev/score-dev:${IMAGE_TAG} env_file: .env + environment: + GOOGLE_APPLICATION_CREDENTIALS: /app/secrets/firebase.json 
ports: - "8000:8000" volumes: - ./ca-certificate.crt:/etc/ssl/ca-certificate.crt:ro # Mount MongoDB cert inside the container, ro for read only + - ${FIREBASE_CREDENTIALS_HOST_PATH:-./firebase-service-account-key.json}:/app/secrets/firebase.json:ro scraper: image: cornellappdev/score-dev:${IMAGE_TAG} diff --git a/requirements.txt b/requirements.txt index fb220c9..f4df598 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ Flask-APScheduler python-dotenv pytz gunicorn -firebase-admin \ No newline at end of file +firebase-admin==7.3.0 \ No newline at end of file diff --git a/src/database.py b/src/database.py index 25ccd39..70d5f4b 100644 --- a/src/database.py +++ b/src/database.py @@ -104,6 +104,13 @@ def setup_database_indexes(): # JWT blocklist: fast lookup by jti db["token_blocklist"].create_index([("jti", 1)], background=True) + try: + db["users"].create_index( + [("firebase_uid", 1)], unique=True, sparse=True, background=True + ) + except (DuplicateKeyError, OperationFailure) as e: + print(f"Warning: Could not create unique index on users.firebase_uid: {e}") + print("✅ MongoDB indexes created successfully") except Exception as e: print(f"❌ Failed to create MongoDB indexes: {e}") diff --git a/src/mutations/login_user.py b/src/mutations/login_user.py index b3bece6..91591c3 100644 --- a/src/mutations/login_user.py +++ b/src/mutations/login_user.py @@ -5,6 +5,12 @@ from flask_jwt_extended import create_access_token, create_refresh_token from src.database import db +_TOKEN_ERRORS = ( + firebase_auth.InvalidIdTokenError, + firebase_auth.ExpiredIdTokenError, + firebase_auth.RevokedIdTokenError, +) + class LoginUser(Mutation): class Arguments: @@ -16,10 +22,14 @@ class Arguments: def mutate(self, info, id_token): try: decoded = firebase_auth.verify_id_token(id_token) - except Exception: - raise GraphQLError("Invalid or expired token.") + except _TOKEN_ERRORS as err: + raise GraphQLError("Invalid or expired token.") from err + except ValueError as err: 
+ raise GraphQLError("Invalid or expired token.") from err - firebase_uid = decoded["uid"] + firebase_uid = decoded.get("uid") + if not firebase_uid: + raise GraphQLError("Invalid or expired token.") user = db["users"].find_one({"firebase_uid": firebase_uid}) if not user: raise GraphQLError("User not found.") diff --git a/src/mutations/signup_user.py b/src/mutations/signup_user.py index 4ff60b6..d4e530d 100644 --- a/src/mutations/signup_user.py +++ b/src/mutations/signup_user.py @@ -3,8 +3,15 @@ from firebase_admin import auth as firebase_auth from flask_jwt_extended import create_access_token, create_refresh_token +from pymongo.errors import DuplicateKeyError from src.database import db +_TOKEN_ERRORS = ( + firebase_auth.InvalidIdTokenError, + firebase_auth.ExpiredIdTokenError, + firebase_auth.RevokedIdTokenError, +) + class SignupUser(Mutation): class Arguments: @@ -18,12 +25,14 @@ class Arguments: def mutate(self, info, id_token, name=None, email=None): try: decoded = firebase_auth.verify_id_token(id_token) - except Exception: - raise GraphQLError("Invalid or expired token.") + except _TOKEN_ERRORS as err: + raise GraphQLError("Invalid or expired token.") from err + except ValueError as err: + raise GraphQLError("Invalid or expired token.") from err - firebase_uid = decoded["uid"] - if db["users"].find_one({"firebase_uid": firebase_uid}): - raise GraphQLError("User already exists.") + firebase_uid = decoded.get("uid") + if firebase_uid is None: + raise GraphQLError("Token missing uid") from KeyError("uid") email = email or decoded.get("email") user_doc = { @@ -33,7 +42,10 @@ def mutate(self, info, id_token, name=None, email=None): } if name is not None: user_doc["name"] = name - result = db["users"].insert_one(user_doc) + try: + result = db["users"].insert_one(user_doc) + except DuplicateKeyError as err: + raise GraphQLError("User already exists.") from err identity = str(result.inserted_id) return SignupUser( access_token=create_access_token(identity=identity), 
diff --git a/src/repositories/game_repository.py b/src/repositories/game_repository.py index e531286..5de3269 100644 --- a/src/repositories/game_repository.py +++ b/src/repositories/game_repository.py @@ -11,6 +11,16 @@ logger = logging.getLogger(__name__) +def _time_for_lookup(time): + """True when `time` should be included in a query (aligned with uniq_game_key_with_time).""" + if time is None: + return False + s = str(time).strip() + if not s or s in ("TBD", "TBA"): + return False + return True + + class GameRepository: @staticmethod def find_all(limit=100, offset=0): @@ -103,24 +113,26 @@ def find_by_data(city, date, gender, location, opponent_id, sport, state, time): return Game.from_dict(game_data) if game_data else None @staticmethod - def find_by_key_fields(city, date, gender, location, opponent_id, sport, state): + def find_by_key_fields(city, date, gender, location, opponent_id, sport, state, time=None): """ - Find games without time for duplicate games + Find games by key fields. When `time` is a concrete value (not TBD/TBA), the query + includes it so doubleheaders resolve to a single row. Otherwise falls back to the + legacy filter without time (multiple rows possible). 
""" game_collection = db["game"] - games = list( - game_collection.find( - { - "city": city, - "date": date, - "gender": gender, - "location": location, - "opponent_id": opponent_id, - "sport": sport, - "state": state, - } - ) - ) + base = { + "city": city, + "date": date, + "gender": gender, + "location": location, + "opponent_id": opponent_id, + "sport": sport, + "state": state, + } + if _time_for_lookup(time): + games = list(game_collection.find({**base, "time": time})) + else: + games = list(game_collection.find(base)) if not games: return None @@ -131,7 +143,7 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state): return [Game.from_dict(game) for game in games] @staticmethod - def find_by_tournament_key_fields(city, date, gender, location, sport, state): + def find_by_tournament_key_fields(city, date, gender, location, sport, state, time=None): """ Find tournament games by location and date (excluding opponent_id). This is used when we need to find a tournament game that might have a placeholder team. 
@@ -145,6 +157,8 @@ def find_by_tournament_key_fields(city, date, gender, location, sport, state): "gender": gender, "sport": sport, } + if _time_for_lookup(time): + query["time"] = time # For city, state, and location, use flexible matching # This allows finding games even when TBD/TBA values change to real values diff --git a/src/scrapers/game_details_scrape.py b/src/scrapers/game_details_scrape.py index d8c6a93..0f5a07c 100644 --- a/src/scrapers/game_details_scrape.py +++ b/src/scrapers/game_details_scrape.py @@ -70,7 +70,8 @@ def softball_summary(box_score_section): cornell_score = int(row.find_all(TAG_TD)[5].get_text(strip=True) or 0) opp_score = int(row.find_all(TAG_TD)[6].get_text(strip=True) or 0) summary.append({ - 'team': team, + 'team': team, + 'period': inning, 'inning': inning, 'description': desc, 'cor_score': cornell_score, @@ -159,6 +160,7 @@ def hockey_summary(box_score_section): summary.append({ 'team': team, + 'period': period, 'time': time, 'scorer': scorer, 'assist': assist, diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index 547df44..3392ee8 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -125,8 +125,8 @@ def parse_schedule_page(url, sport, gender): result_tag = game_item.select_one(RESULT_TAG) if result_tag: - #game_data["result"] = result_tag.get_text(" ", strip=True) - game_data["result"] = result_tag.text.strip().replace("\n", " ") + raw = result_tag.get_text(" ", strip=True) + game_data["result"] = re.sub(r"\s+", " ", raw).strip() else: game_data["result"] = None @@ -250,6 +250,7 @@ def process_game_data(game_data): location, game_data["sport"], state, + game_time, ) if not curr_game: @@ -261,6 +262,7 @@ def process_game_data(game_data): team.id, game_data["sport"], state, + game_time, ) if isinstance(curr_game, list): diff --git a/src/services/game_service.py b/src/services/game_service.py index 6fd3479..6580aea 100644 --- a/src/services/game_service.py +++ 
b/src/services/game_service.py @@ -74,22 +74,22 @@ def get_game_by_data(city, date, gender, location, opponent_id, sport, state, ti ) @staticmethod - def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, state): + def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, state, time=None): """ - Retrieve a game by its essential fields, ignoring time + Retrieve game(s) by key fields. Pass `time` when known so doubleheaders match one row. """ return GameRepository.find_by_key_fields( - city, date, gender, location, opponent_id, sport, state + city, date, gender, location, opponent_id, sport, state, time ) @staticmethod - def get_game_by_tournament_key_fields(city, date, gender, location, sport, state): + def get_game_by_tournament_key_fields(city, date, gender, location, sport, state, time=None): """ Retrieve a tournament game by location and date (excluding opponent_id). This is used when we need to find a tournament game that might have a placeholder team. """ return GameRepository.find_by_tournament_key_fields( - city, date, gender, location, sport, state + city, date, gender, location, sport, state, time ) @staticmethod diff --git a/src/types.py b/src/types.py index cef15c2..15ce333 100644 --- a/src/types.py +++ b/src/types.py @@ -42,7 +42,8 @@ class BoxScoreEntryType(ObjectType): Attributes: - `team`: The team involved in the scoring event. - - `period`: The period or inning of the event. + - `period`: The period of the event (e.g. hockey period). + - `inning`: The inning of the event (e.g. baseball/softball). - `time`: The time of the scoring event. - `description`: A description of the play or scoring event. - `scorer`: The name of the scorer. 
From 58052679a5835319d33910270e355a9ed7a07033 Mon Sep 17 00:00:00 2001 From: claiireyu Date: Wed, 22 Apr 2026 14:58:04 -0400 Subject: [PATCH 4/4] Added recap link scraping for sports that do not have box scores - Added optional fields for game recap links, article titles, and published dates in the GameType and Game models. - Updated CreateGame mutation to accept new recap fields. - Implemented a new scraping function to extract recap headlines and published times from Cornell Sidearm story pages. --- src/models/game.py | 15 ++++++ src/mutations/create_game.py | 13 ++++- src/scrapers/game_details_scrape.py | 41 +++++++++++++- src/scrapers/games_scraper.py | 83 +++++++++++++++++++++-------- src/types.py | 13 ++++- src/utils/constants.py | 31 +++++++++++ 6 files changed, 169 insertions(+), 27 deletions(-) diff --git a/src/models/game.py b/src/models/game.py index 73a7968..e2e79e5 100644 --- a/src/models/game.py +++ b/src/models/game.py @@ -18,6 +18,9 @@ class Game: - `box_score` The scoring summary of the game (optional) - `score_breakdown` The scoring breakdown of the game (optional) - 'ticket_link' The ticket link for the game (optional) + - 'recap_link' The recap/details link for the game (optional) + - 'recap_article_title' Title from the recap/story page when scraped (optional) + - 'recap_published_at' Published date/time string from the recap page (optional) """ def __init__( @@ -37,6 +40,9 @@ def __init__( team=None, utc_date=None, ticket_link=None, + recap_link=None, + recap_article_title=None, + recap_published_at=None, ): self.id = id if id else str(ObjectId()) self.city = city @@ -53,6 +59,9 @@ def __init__( self.team = team self.utc_date = utc_date self.ticket_link = ticket_link + self.recap_link = recap_link + self.recap_article_title = recap_article_title + self.recap_published_at = recap_published_at def to_dict(self): """ @@ -74,6 +83,9 @@ def to_dict(self): "team": self.team, "utc_date": self.utc_date, "ticket_link": self.ticket_link, + "recap_link": 
self.recap_link, + "recap_article_title": self.recap_article_title, + "recap_published_at": self.recap_published_at, } @staticmethod @@ -97,4 +109,7 @@ def from_dict(data) -> None: team=data.get("team"), utc_date=data.get("utc_date"), ticket_link=data.get("ticket_link"), + recap_link=data.get("recap_link"), + recap_article_title=data.get("recap_article_title"), + recap_published_at=data.get("recap_published_at"), ) diff --git a/src/mutations/create_game.py b/src/mutations/create_game.py index 3a52345..28d2ce6 100644 --- a/src/mutations/create_game.py +++ b/src/mutations/create_game.py @@ -18,6 +18,9 @@ class Arguments: score_breakdown = String(required=False) utc_date = String(required=False) ticket_link = String(required=False) + recap_link = String(required=False) + recap_article_title = String(required=False) + recap_published_at = String(required=False) game = Field(lambda: GameType) @@ -36,7 +39,10 @@ def mutate( box_score=None, score_breakdown=None, utc_date=None, - ticket_link=None + ticket_link=None, + recap_link=None, + recap_article_title=None, + recap_published_at=None, ): game_data = { "city": city, @@ -51,7 +57,10 @@ def mutate( "box_score": box_score, "score_breakdown": score_breakdown, "utc_date": utc_date, - "ticket_link": ticket_link + "ticket_link": ticket_link, + "recap_link": recap_link, + "recap_article_title": recap_article_title, + "recap_published_at": recap_published_at, } new_game = GameService.create_game(game_data) return CreateGame(game=new_game) \ No newline at end of file diff --git a/src/scrapers/game_details_scrape.py b/src/scrapers/game_details_scrape.py index 0f5a07c..7ce53d7 100644 --- a/src/scrapers/game_details_scrape.py +++ b/src/scrapers/game_details_scrape.py @@ -22,9 +22,46 @@ def clean_name(name): return cleaned def fetch_page(url): - response = requests.get(url) + response = requests.get(url, headers=HTTP_REQUEST_HEADERS, timeout=20) return BeautifulSoup(response.text, 'html.parser') + +def 
scrape_sidearm_story_recap(url): + """ + Extract headline and published time from a Cornell Sidearm story/recap page + """ + if not url: + return {} + try: + response = requests.get(url, headers=HTTP_REQUEST_HEADERS, timeout=20) + if response.status_code != 200: + return {} + soup = BeautifulSoup(response.text, "html.parser") + except Exception: + return {} + headline = soup.select_one(SIDEARM_STORY_HEADLINE) + time_el = soup.select_one(SIDEARM_STORY_PUBLISHED_TIME) + title = headline.get_text(strip=True) if headline else None + if not title: + og = soup.find("meta", property="og:title") + if og and og.get("content"): + title = og["content"].strip() + published_at = None + if time_el: + published_at = time_el.get_text(strip=True) + if not published_at and time_el.get("datetime"): + published_at = time_el["datetime"].strip() + if not published_at: + pmeta = soup.find("meta", property="article:published_time") + if pmeta and pmeta.get("content"): + published_at = pmeta["content"].strip() + out = {} + if title: + out["recap_article_title"] = title + if published_at: + out["recap_published_at"] = published_at + return out + def extract_teams_and_scores(box_score_section, sport): score_table = box_score_section.find(TAG_TABLE, class_=CLASS_SIDEARM_TABLE) team_names = [] @@ -258,6 +295,7 @@ def baseball_summary(box_score_section): summary = [{"message": "No scoring events in this game."}] return summary + # def basketball_summary(box_score_section): # summary = [] # scoring_section = box_score_section.find(TAG_SECTION, {ATTR_ARIA_LABEL: LABEL_SCORING_SUMMARY}) @@ -303,6 +341,7 @@ def scrape_game(url, sport): 'baseball': (lambda: extract_teams_and_scores(box_score_section, 'baseball'), baseball_summary), 'softball': (lambda: extract_teams_and_scores(box_score_section, 'softball'), softball_summary), 'basketball': (lambda: extract_teams_and_scores(box_score_section, 'basketball'), lambda _: []), + } extract_teams_func, summary_func = sport_parsers.get(sport, (None, None)) 
diff --git a/src/scrapers/games_scraper.py b/src/scrapers/games_scraper.py index 3392ee8..01af4ad 100644 --- a/src/scrapers/games_scraper.py +++ b/src/scrapers/games_scraper.py @@ -3,7 +3,7 @@ from src.services import GameService, TeamService from src.utils.convert_to_utc import convert_to_utc from src.utils.constants import * -from src.scrapers.game_details_scrape import scrape_game +from src.scrapers.game_details_scrape import scrape_game, scrape_sidearm_story_recap from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss import base64 import re @@ -40,6 +40,38 @@ def infer_game_year(date_text, season_years): return second_year return first_year + +def to_absolute_url(link): + """Convert relative Cornell links like /news/... to absolute URLs.""" + if not link: + return None + if link.startswith("http://") or link.startswith("https://"): + return link + return f"{BASE_URL.rstrip('/')}/{link.lstrip('/')}" + + +def parse_game_links(game_item): + """ + Parse link info for a schedule item. + Returns (recap_link, ticket_link). + """ + recap_link = None + box_score_tag = game_item.select_one(BOX_SCORE_TAG) + if box_score_tag: + box_score_link = box_score_tag.get("href") + recap_link = to_absolute_url(box_score_link) + + # Many sports expose recap links here when there is no separate box score link. + if not recap_link: + recap_tag = game_item.select_one(RECAP_TAG) + if recap_tag: + recap_link = to_absolute_url(recap_tag.get("href")) + + ticket_link_tag = game_item.select_one(GAME_TICKET_LINK) + ticket_link = to_absolute_url(ticket_link_tag["href"]) if ticket_link_tag else None + + return recap_link, ticket_link + def fetch_game_schedule(): """ Scrape the game schedule from the given URLs in parallel using threads. @@ -71,7 +103,7 @@ def parse_schedule_page(url, sport, gender): sport (str): The sport of the games. gender (str): The gender of the games. 
""" - response = requests.get(url) + response = requests.get(url, headers=HTTP_REQUEST_HEADERS, timeout=30) soup = BeautifulSoup(response.content, "html.parser") page_title = soup.title.text.strip() if soup.title else "" @@ -130,11 +162,15 @@ def parse_schedule_page(url, sport, gender): else: game_data["result"] = None - box_score_tag = game_item.select_one(BOX_SCORE_TAG) - if box_score_tag: - box_score_link = box_score_tag["href"] - game_details = scrape_game(f"{BASE_URL}{box_score_link}", sport.lower()) - if game_details.get('error') == 'Sport parser not found': + recap_link, ticket_link = parse_game_links(game_item) + + # These sports use news/recap pages instead of parsable box-score HTML. + if sport in SPORTS_WITH_SIDEARM_STORY_RECAP or not recap_link: + game_data["box_score"] = None + game_data["score_breakdown"] = None + else: + game_details = scrape_game(recap_link, sport.lower()) + if game_details.get("error") == "Sport parser not found": game_data["box_score"] = None game_data["score_breakdown"] = None else: @@ -145,23 +181,17 @@ def parse_schedule_page(url, sport, gender): location_data = game_data["location"].split("\n") if game_data["location"] else [""] geo_location = location_data[0] is_home_game = "Ithaca" in geo_location - + if is_home_game and game_data["box_score"]: for event in game_data["box_score"]: if "cor_score" in event and "opp_score" in event: event["cor_score"], event["opp_score"] = event["opp_score"], event["cor_score"] - else: - game_data["box_score"] = None - game_data["score_breakdown"] = None - - ticket_link_tag = game_item.select_one(GAME_TICKET_LINK) - ticket_link = ( - ticket_link_tag["href"] if ticket_link_tag else None - ) - game_data["ticket_link"] = ( - ticket_link if ticket_link else None - ) + if sport in SPORTS_WITH_SIDEARM_STORY_RECAP and recap_link: + game_data.update(scrape_sidearm_story_recap(recap_link)) + + game_data["ticket_link"] = ticket_link + game_data["recap_link"] = recap_link process_game_data(game_data) @@ 
-174,7 +204,8 @@ def process_game_data(game_data): """ game_data = normalize_game_data(game_data) - location_data = game_data["location"].split("\n") + location_raw = game_data.get("location") or "" + location_data = location_raw.split("\n") if location_raw else [""] geo_location = location_data[0] if (",") not in geo_location: city = geo_location @@ -281,8 +312,13 @@ def process_game_data(game_data): "city": city, "location": location, "state": state, - "ticket_link": game_data["ticket_link"] + "ticket_link": game_data["ticket_link"], + "recap_link": game_data.get("recap_link"), } + if "recap_article_title" in game_data: + updates["recap_article_title"] = game_data["recap_article_title"] + if "recap_published_at" in game_data: + updates["recap_published_at"] = game_data["recap_published_at"] current_team = TeamService.get_team_by_id(curr_game.opponent_id) if current_team and is_tournament_placeholder_team(current_team.name): @@ -307,7 +343,10 @@ def process_game_data(game_data): "box_score": game_data["box_score"], "score_breakdown": game_data["score_breakdown"], "utc_date": utc_date_str, - "ticket_link": game_data["ticket_link"] + "ticket_link": game_data["ticket_link"], + "recap_link": game_data.get("recap_link"), + "recap_article_title": game_data.get("recap_article_title"), + "recap_published_at": game_data.get("recap_published_at"), } GameService.create_game(game_data) \ No newline at end of file diff --git a/src/types.py b/src/types.py index 15ce333..0b26a2e 100644 --- a/src/types.py +++ b/src/types.py @@ -1,5 +1,4 @@ from graphene import ObjectType, Field, String, List, Int -from datetime import datetime class TeamType(ObjectType): """ @@ -91,6 +90,9 @@ class GameType(ObjectType): - `box_score`: The box score of the game. - `score_breakdown`: The score breakdown of the game. - `ticket_link`: The ticket link of the game. (optional) + - `recap_link`: The recap/details link of the game. 
(optional) + - `recap_article_title`: The headline scraped from the recap story page. (optional) + - `recap_published_at`: The published date/time scraped from the recap story page. (optional) """ id = String(required=False) @@ -108,8 +110,11 @@ class GameType(ObjectType): team = Field(TeamType, required=False) utc_date = String(required=False) ticket_link = String(required=False) + recap_link = String(required=False) + recap_article_title = String(required=False) + recap_published_at = String(required=False) def __init__( - self, id, city, date, gender, location, opponent_id, result, sport, state, time, box_score=None, score_breakdown=None, utc_date=None, ticket_link=None + self, id, city, date, gender, location, opponent_id, result, sport, state, time, box_score=None, score_breakdown=None, utc_date=None, ticket_link=None, recap_link=None, recap_article_title=None, recap_published_at=None ): self.id = id self.city = city @@ -125,6 +130,10 @@ def __init__( self.score_breakdown = score_breakdown self.utc_date = utc_date self.ticket_link = ticket_link + self.recap_link = recap_link + self.recap_article_title = recap_article_title + self.recap_published_at = recap_published_at + @staticmethod def team_to_team_type(team_obj): if team_obj is None: diff --git a/src/utils/constants.py b/src/utils/constants.py index 38c2ae7..ab6ea6c 100644 --- a/src/utils/constants.py +++ b/src/utils/constants.py @@ -9,6 +9,30 @@ # Base URL BASE_URL = "https://cornellbigred.com" +# cornellbigred.com often returns 404 or empty HTML for Python's default requests User-Agent. +HTTP_REQUEST_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", +} + +# Schedule rows for these sports link to Sidearm story pages (no box score); scrape recap headline/date the same way.
+SPORTS_WITH_SIDEARM_STORY_RECAP = frozenset( + { + "Swimming & Diving", + "Track & Field", + "Wrestling", + "Golf", + "Polo", + "Fencing", + "Equestrian", + "Gymnastics" + } +) + # The tag for each game GAME_TAG = ".sidearm-schedule-game" @@ -45,9 +69,16 @@ # The tag for the box score BOX_SCORE_TAG = ".sidearm-schedule-game-links-boxscore a" +# The tag for recap/details links +RECAP_TAG = ".sidearm-schedule-game-links-recap a" + # The tag for the game ticket link GAME_TICKET_LINK = ".sidearm-schedule-game-links-tickets a" +# CSS selectors for the headline and published time on a Sidearm full-story recap article page +SIDEARM_STORY_HEADLINE = "h1.sidearm-story-template-headline" +SIDEARM_STORY_PUBLISHED_TIME = ".sidearm-story-template-date time" + # HTML Tags TAG_TABLE = 'table' TAG_SECTION = 'section'