Module mongo_auteur¶

`Auteur` ¶

Bases: BaseEntity

Source code in nbs/mongo_auteur.py

class Auteur(BaseEntity):
    """Entity representing an author, built on top of ``BaseEntity``."""

    # Collection name passed to the BaseEntity initializer by ``__init__``.
    collection: str = "auteurs"

    def __init__(self, nom: str) -> None:
        """Initialize an Auteur instance.

        Args:
            nom (str): The author's name.
        """
        super().__init__(nom, self.collection)

    def some_method(self):
        """Format ``self.episode.date`` as ``%Y/%m/%d`` through ``format_date``.

        The formatted value is only bound to a local name; nothing is returned.
        """
        # NOTE(review): ``__init__`` above never sets ``self.episode`` and
        # ``format_date`` is not defined in the visible source - confirm both
        # exist (e.g. on BaseEntity / at module level) before relying on this.
        fmt_date = format_date(self.episode.date, "%Y/%m/%d")

`__init__(nom)` ¶

Initialise une instance d'Auteur.

Parameters:

Name	Type	Description	Default
`nom`	`str`	Le nom de l'auteur.	required

Source code in nbs/mongo_auteur.py

def __init__(self, nom: str) -> None:
    """Initialize an Auteur instance.

    Args:
        nom (str): The author's name.
    """
    # Delegates to the parent initializer with the name and the class-level
    # ``collection`` value.
    super().__init__(nom, self.collection)

`AuthorChecker` ¶

Class to verify and correct an author's name using multiple data sources.

This class verifies an author in an episode through sources including:

- RSS metadata (title, description)
- MongoDB database of known authors
- LLM suggestions
- Web search analysis

Source code in nbs/mongo_auteur.py

class AuthorChecker:
    """Class to verify and correct an author's name using multiple data sources.

    This class verifies an author in an episode through sources including:
      - RSS metadata (title, description)
      - MongoDB database of known authors
      - LLM suggestions
      - Web search analysis
    """

    def __init__(self, episode: Episode) -> None:
        """Initializes the AuthorChecker with an episode.

        Construction is not free: the last statement immediately calls
        ``_get_authors_from_titre_description``, which issues an LLM chat request.

        Args:
            episode (Episode): An episode instance containing title and description.
        """
        self.episode = episode
        # LLM client obtained from ``get_azure_llm`` for the "gpt-4o" model name.
        self.llm_structured_output = get_azure_llm("gpt-4o")
        # Must stay last: the extraction reads both ``self.episode`` and
        # ``self.llm_structured_output`` set just above.
        self.authors_titre_description = self._get_authors_from_titre_description()

    def _get_filtered_titre_description(self, titre_or_description: str) -> str:
        """Filter the given titre or description to avoid Error 400.

        Filters out substrings that may trigger Azure OpenAI's content management policy (resulting in a 400 error).
        Specifically, for certain dates, predefined terms are replaced as specified in the filter mapping.

        For more details, see:
            https://github.com/castorfou/lmelp/issues/21

        Args:
            titre_or_description (str): 'titre' or 'description'depending on what to filter.

        Returns:
            str: The filtered titre or description.
        """
        filtering = {
            "2020/11/15": {"fossoyeur": "rigolo"},
        }
        fmt_date = self.episode.date.strftime("%Y/%m/%d")
        replacements = filtering.get(fmt_date)

        text = (
            self.episode.titre
            if titre_or_description == "titre"
            else self.episode.description
        )
        if replacements:
            for key, value in replacements.items():
                text = text.replace(key, value)
        return text

    def _get_authors_from_titre_description(self) -> List[str]:
        """Retrieves a list of author names extracted from the episode title and description using LLM.

        Returns:
            List[str]: A list of author names extracted from the title and description.
        """
        response_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "AuthorTitreDescriptionList",
                "schema": {
                    "type": "object",
                    "properties": {
                        "Authors_TitreDescription": {
                            "type": "array",
                            "items": {
                                "type": "string",
                                "description": "A list of names from title and description",
                            },
                        }
                    },
                    "required": ["Authors_TitreDescription"],
                    "additionalProperties": False,
                },
            },
        }
        try:
            titre = self._get_filtered_titre_description("titre")
            description = self._get_filtered_titre_description("description")
            response = self.llm_structured_output.chat(
                messages=[
                    ChatMessage(
                        role="system",
                        content="Tu es un assistant utile qui retourne une liste JSON de noms.",
                    ),
                    ChatMessage(
                        role="user",
                        content=f"Est-ce que tu peux me lister tous les noms qui sont cités dans le titre et la description de l'épisode suivant : {titre} {description}. ",
                    ),
                ],
                response_format=response_schema,
            )
        except Exception as e:
            print(f"Error getting authors from titre/description: {e}")
            print(f"prompt: {titre} {description}")
            return []
        try:
            json_dict = json.loads(response.message.content)
        except json.JSONDecodeError as e:
            print("Error parsing JSON:", e)
            print("Raw response:", response.message.content)
            return []  # Return an empty list if parsing fails
        return json_dict["Authors_TitreDescription"]

    def _get_authors_from_llm(self, autor: str) -> List[str]:
        """Ask the LLM for author names close to the given one.

        The request uses a JSON-schema response format asking for an
        ``Authors_LLM`` array of strings.

        Args:
            autor (str): The author name to query.

        Returns:
            List[str]: The author names suggested by the LLM, or an empty list
                when the response is not valid JSON.
        """
        name_items = {
            "type": "string",
            "description": "A list of authors' names",
        }
        response_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "AuthorList",
                "schema": {
                    "type": "object",
                    "properties": {
                        "Authors_LLM": {"type": "array", "items": name_items}
                    },
                    "required": ["Authors_LLM"],
                    "additionalProperties": False,
                },
            },
        }

        # The queried name is spliced between a fixed head and tail.
        prompt_head = """
        Tu es un agent expert en littérature.
        Donne moi quelques auteurs dont le nom s'approche de celui-ci : """
        prompt_tail = """

        S'il s'agit deja d'un auteur connu, retourne moi juste son nom. S'il y a une erreur dans le nom que je t'ai donne, corrige moi en me donnant le nom de l'auteur que tu penses que j'ai voulu dire.

        Je veux que tu me donnes le prenom puis le nom dans cet ordre. Par exemple "Marcel Pagnol" ou "Victor Hugo".
        Ces auteurs sont susceptibles d'etre discutes dans "Le Masque et la Plume".

        Si tu me retournes plusieurs auteurs, fais le sous forme de liste par exemple si tu as identifie "auteur 1" et "auteur 2" alors retourne ["auteur 1", "auteur 2"]
        """
        prompt = "".join((prompt_head, autor, prompt_tail))

        conversation = [
            ChatMessage(
                role="system",
                content="Tu es un agent litteraire qui connait parfaitement les auteurs.",
            ),
            ChatMessage(role="user", content=f"{prompt}. "),
        ]
        response = self.llm_structured_output.chat(
            messages=conversation,
            response_format=response_schema,
        )

        try:
            payload = json.loads(response.message.content)
        except json.JSONDecodeError as e:
            print("Error parsing JSON:", e)
            print("Raw response:", response.message.content)
            return []
        else:
            return payload["Authors_LLM"]

    def _get_author_from_web(self, author: str) -> Dict[str, Union[str, int]]:
        """Ask the LLM whether a name is a book author, based on a Google search.

        The name is searched with ``google_search`` and the raw result is
        embedded in a prompt; the LLM is asked to answer with a JSON object
        following the schema below.

        Args:
            author (str): The author name to verify.

        Returns:
            Dict[str, Union[str, int]]: The decoded LLM answer, requested to contain:
                - "auteur": The author name, corrected if applicable.
                - "certitude": An integer between 0 and 100 indicating the confidence.
                - "analyse": A textual analysis of the Google search result.
            An empty dict is returned when the answer is not valid JSON.
        """
        schema_properties = {
            "auteur": {
                "type": "string",
                "description": "Le nom de l'auteur éventuellement corrigé (accents, fautes de frappe).",
            },
            "certitude": {
                "type": "integer",
                "description": "Pourcentage de certitude (0 à 100).",
                "minimum": 0,
                "maximum": 100,
            },
            "analyse": {
                "type": "string",
                "description": "Analyse de la requête Google concernant l'auteur.",
            },
        }
        response_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "AuteurSchema",
                "schema": {
                    "type": "object",
                    "properties": schema_properties,
                    "required": ["auteur", "certitude", "analyse"],
                    "additionalProperties": False,
                },
            },
        }

        result_google = google_search(author)

        user_prompt = f"""
        Voici le resultat d'une requete google concernant un probable auteur inconnu de mon llm : {author}
        La requete est au format dict avec du json a l'interieur.
        Est-ce que tu peux analyser le contenu de cette requete et me dire si oui ou non {author} est un auteur de livres, 
        et accompagner ta reponse d'un pourcentage de certitude :
        * 100% de certitude signifie que tu es certain que {author} est un auteur de livres
        *  50% tu es ni sure ni pas sure que {author} est un auteur de livres
        *   0% tu es certain que {author} n'est pas un auteur de livres

        Voici le contenu de la requete google : {result_google}

        Tu repondras uniquement avec un dictionnaire qui va contenir 3 entrees :

        - "auteur" : le nom de l'auteur, eventuellement corrige si j'ai oublie des accents ou une faute de frappe
        - "certitude" : le pourcentage de certitude de 0 à 100, un entier
        - "analyse" : une analyse de la requete Google concernant l'auteur.
        """

        conversation = [
            ChatMessage(
                role="system",
                content="Tu es un assistant utile qui analyse des requetes Google pour y deceler si un auteur de livre s'y cache.",
            ),
            ChatMessage(
                role="user",
                content=user_prompt,
            ),
        ]
        response = self.llm_structured_output.chat(
            messages=conversation,
            response_format=response_schema,
        )

        try:
            verdict = json.loads(response.message.content)
        except json.JSONDecodeError as e:
            print("Error parsing JSON:", e)
            print("Raw response:", response.message.content)
            return {}
        else:
            return verdict

    def _check_author_source(
        self, author: str, authors_list: List[str]
    ) -> Optional[str]:
        """Fuzzy-match an author name against a list of candidate names.

        A fresh ``AuthorFuzzMatcher`` is built from ``authors_list`` and its
        best match is kept only when the score reaches ``score_fuzz_threshold``.

        Args:
            author (str): The author name to match.
            authors_list (List[str]): The candidate author names to check against.

        Returns:
            Optional[str]: The best matching name when its score is at or above
                the threshold, otherwise None.
        """
        candidate, similarity = AuthorFuzzMatcher(authors_list).find_best_match(author)
        return candidate if similarity >= score_fuzz_threshold else None

    def check_author(
        self, author: str, return_details: bool = False, verbose: bool = False
    ) -> Union[str, Dict[str, Union[str, int]], None]:
        """Resolve an author name by trying each source until one yields a match.

        Sources are consulted in this order and the first hit wins:
          1. RSS metadata (names extracted from the episode title/description)
          2. MongoDB list of known authors
          3. LLM suggested names
          4. Web search analysis

        Sources 1-3 are fuzzy-matched through ``_check_author_source``; source 4
        is accepted when its "certitude" is at least ``score_fuzz_threshold``.

        Args:
            author (str): The author name to verify.
            return_details (bool, optional): If True, returns a detailed dictionary with source and analysis. Defaults to False.
            verbose (bool, optional): If True, prints debug messages. Defaults to False.

        Returns:
            Union[str, Dict[str, Union[str, int]], None]: The corrected author name as a string if return_details is False;
                a detailed dict if return_details is True; or None if no match is found.
        """
        details = {"author_original": author, "author_corrected": None, "source": None}

        # Sources 1-3. Each candidate list sits behind a callable so that the
        # database and the LLM are only queried when every earlier source failed.
        fuzzy_sources = (
            ("rss:metadata", lambda: self.authors_titre_description),
            ("mongodb:auteurs", lambda: [auteur.nom for auteur in Auteur.get_entries()]),
            ("llm", lambda: self._get_authors_from_llm(author)),
        )
        for source, fetch_candidates in fuzzy_sources:
            match = self._check_author_source(author, fetch_candidates())
            if not match:
                continue
            details["author_corrected"] = match
            details["source"] = source
            if verbose:
                print(f"Trouvé avec {source}: {match}")
            return details if return_details else match

        # Source 4: web search, judged on the certainty reported by the LLM.
        web_result_dict = self._get_author_from_web(author)
        match = web_result_dict.get("auteur")
        score = web_result_dict.get("certitude", 0)
        details.update(
            author_corrected=match,
            score=score,
            analyse=web_result_dict.get("analyse", ""),
            source="web search",
        )
        if not (score >= score_fuzz_threshold):
            if verbose:
                print(
                    f"Score insuffisant {score} avec web search: {web_result_dict.get('analyse', '')}"
                )
            # Keep score/analyse/source in the details but report no correction.
            details["author_corrected"] = None
            return details if return_details else None

        if verbose:
            print(f"Trouvé avec web search: {match}")
        return details if return_details else match

`__init__(episode)` ¶

Initializes the AuthorChecker with an episode.

Parameters:

Name	Type	Description	Default
`episode`	`Episode`	An episode instance containing title and description.	required

Source code in nbs/mongo_auteur.py

def __init__(self, episode: Episode) -> None:
    """Initializes the AuthorChecker with an episode.

    Construction is not free: the last statement immediately calls
    ``_get_authors_from_titre_description``, which issues an LLM chat request.

    Args:
        episode (Episode): An episode instance containing title and description.
    """
    self.episode = episode
    # LLM client obtained from ``get_azure_llm`` for the "gpt-4o" model name.
    self.llm_structured_output = get_azure_llm("gpt-4o")
    # Must stay last: the extraction reads both ``self.episode`` and
    # ``self.llm_structured_output`` set just above.
    self.authors_titre_description = self._get_authors_from_titre_description()

`check_author(author, return_details=False, verbose=False)` ¶

Verifies an author's name through various sources and returns the corrected name.

It checks in the following order:

1. RSS metadata (title, description)
2. MongoDB list of known authors
3. LLM suggested names
4. Web search analysis

Parameters:

Name	Type	Description	Default
`author`	`str`	The author name to verify.	required
`return_details`	`bool`	If True, returns a detailed dictionary with source and analysis. Defaults to False.	`False`
`verbose`	`bool`	If True, prints debug messages. Defaults to False.	`False`

Returns:

Type	Description
`Union[str, Dict[str, Union[str, int]], None]`	Union[str, Dict[str, Union[str, int]], None]: The corrected author name as a string if return_details is False; a detailed dict if return_details is True; or None if no match is found.

Source code in nbs/mongo_auteur.py

def check_author(
    self, author: str, return_details: bool = False, verbose: bool = False
) -> Union[str, Dict[str, Union[str, int]], None]:
    """Resolve an author name by trying each source until one yields a match.

    Sources are consulted in this order and the first hit wins:
      1. RSS metadata (names extracted from the episode title/description)
      2. MongoDB list of known authors
      3. LLM suggested names
      4. Web search analysis

    Sources 1-3 are fuzzy-matched through ``_check_author_source``; source 4
    is accepted when its "certitude" is at least ``score_fuzz_threshold``.

    Args:
        author (str): The author name to verify.
        return_details (bool, optional): If True, returns a detailed dictionary with source and analysis. Defaults to False.
        verbose (bool, optional): If True, prints debug messages. Defaults to False.

    Returns:
        Union[str, Dict[str, Union[str, int]], None]: The corrected author name as a string if return_details is False;
            a detailed dict if return_details is True; or None if no match is found.
    """
    details = {"author_original": author, "author_corrected": None, "source": None}

    # Sources 1-3. Each candidate list sits behind a callable so that the
    # database and the LLM are only queried when every earlier source failed.
    fuzzy_sources = (
        ("rss:metadata", lambda: self.authors_titre_description),
        ("mongodb:auteurs", lambda: [auteur.nom for auteur in Auteur.get_entries()]),
        ("llm", lambda: self._get_authors_from_llm(author)),
    )
    for source, fetch_candidates in fuzzy_sources:
        match = self._check_author_source(author, fetch_candidates())
        if not match:
            continue
        details["author_corrected"] = match
        details["source"] = source
        if verbose:
            print(f"Trouvé avec {source}: {match}")
        return details if return_details else match

    # Source 4: web search, judged on the certainty reported by the LLM.
    web_result_dict = self._get_author_from_web(author)
    match = web_result_dict.get("auteur")
    score = web_result_dict.get("certitude", 0)
    details.update(
        author_corrected=match,
        score=score,
        analyse=web_result_dict.get("analyse", ""),
        source="web search",
    )
    if not (score >= score_fuzz_threshold):
        if verbose:
            print(
                f"Score insuffisant {score} avec web search: {web_result_dict.get('analyse', '')}"
            )
        # Keep score/analyse/source in the details but report no correction.
        details["author_corrected"] = None
        return details if return_details else None

    if verbose:
        print(f"Trouvé avec web search: {match}")
    return details if return_details else match

`AuthorFuzzMatcher` ¶

Source code in nbs/mongo_auteur.py

class AuthorFuzzMatcher:
    """Fuzzy matcher of a name against a set of reference author names."""

    def __init__(self, reference_authors: Optional[List[str]] = None) -> None:
        """Build the matcher from an optional list of known author names.

        Args:
            reference_authors (Optional[List[str]]): Known author names; a missing
                or empty value gives an empty reference set. Defaults to None.
        """
        self.reference_authors = set(reference_authors or ())

    def add_reference_author(self, author: str) -> None:
        """Register one more reference author, with surrounding whitespace removed.

        Args:
            author (str): The author name to be added.
        """
        cleaned = author.strip()
        self.reference_authors.add(cleaned)

    def find_best_match(
        self, name: str, min_score: int = 80
    ) -> Tuple[Optional[str], int]:
        """Return the reference author closest to ``name`` using token set ratio.

        Args:
            name (str): The name to match against the reference authors.
            min_score (int, optional): The minimal score required for a match. Defaults to 80.

        Returns:
            Tuple[Optional[str], int]: The best matching author's name (or None when
                the best score is below ``min_score``) together with that score.
                ``(None, 0)`` when ``name`` or the reference set is empty.
        """
        # Nothing to compare: skip the fuzzy search entirely.
        if not (name and self.reference_authors):
            return None, 0
        candidate, score = process.extractOne(
            name, self.reference_authors, scorer=fuzz.token_set_ratio
        )
        return (candidate if score >= min_score else None), score

`__init__(reference_authors=None)` ¶

Initializes an AuthorFuzzMatcher with a list of known author names.

Parameters:

Name	Type	Description	Default
`reference_authors`	`Optional[List[str]]`	A list of known author names. Defaults to None.	`None`

Source code in nbs/mongo_auteur.py

def __init__(self, reference_authors: Optional[List[str]] = None) -> None:
    """Build the matcher from an optional list of known author names.

    Args:
        reference_authors (Optional[List[str]]): Known author names; a missing
            or empty value gives an empty reference set. Defaults to None.
    """
    self.reference_authors = set(reference_authors or ())

`add_reference_author(author)` ¶

Adds a new reference author to the set.

Parameters:

Name	Type	Description	Default
`author`	`str`	The author name to be added.	required

Source code in nbs/mongo_auteur.py

def add_reference_author(self, author: str) -> None:
    """Register one more reference author, with surrounding whitespace removed.

    Args:
        author (str): The author name to be added.
    """
    cleaned = author.strip()
    self.reference_authors.add(cleaned)

`find_best_match(name, min_score=80)` ¶

Finds the best matching reference author for a given name using token set ratio.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name to match against the reference authors.	required
`min_score`	`int`	The minimal score required for a match. Defaults to 80.	`80`

Returns:

Type	Description
`Tuple[Optional[str], int]`	Tuple[Optional[str], int]: A tuple with the best matching author's name (or None if no match satisfies the minimum score) and the matching score.

Source code in nbs/mongo_auteur.py

def find_best_match(
    self, name: str, min_score: int = 80
) -> Tuple[Optional[str], int]:
    """Return the reference author closest to ``name`` using token set ratio.

    Args:
        name (str): The name to match against the reference authors.
        min_score (int, optional): The minimal score required for a match. Defaults to 80.

    Returns:
        Tuple[Optional[str], int]: The best matching author's name (or None when
            the best score is below ``min_score``) together with that score.
            ``(None, 0)`` when ``name`` or the reference set is empty.
    """
    # Nothing to compare: skip the fuzzy search entirely.
    if not (name and self.reference_authors):
        return None, 0
    candidate, score = process.extractOne(
        name, self.reference_authors, scorer=fuzz.token_set_ratio
    )
    return (candidate if score >= min_score else None), score

`google_search(query)` ¶

Effectue une recherche Google en utilisant l'API Custom Search et retourne les résultats.

Parameters:

Name	Type	Description	Default
`query`	`str`	La requête de recherche.	required

Returns:

Type	Description
`Optional[List[Dict[str, Optional[str]]]]`	Optional[List[Dict[str, Optional[str]]]]: Une liste de dictionnaires représentant les résultats de la recherche, chaque dictionnaire contenant les clés 'title', 'snippet' et 'link'. Retourne None en cas d'erreur.

Raises: ValueError: Si les variables d'environnement GOOGLE_CUSTOM_SEARCH_API_KEY ou SEARCH_ENGINE_ID ne sont pas définies.

Source code in nbs/mongo_auteur.py

def google_search(query: str) -> Optional[List[Dict[str, Optional[str]]]]:
    """Run a Google search through the Custom Search API and return the results.

    Args:
        query (str): The search query.

    Returns:
        Optional[List[Dict[str, Optional[str]]]]:
            A list of dictionaries, one per search result, each holding the
            keys 'title', 'snippet' and 'link'. Returns None when the search fails.
    Raises:
        ValueError: If ``api_key`` or ``cse_id`` (the GOOGLE_CUSTOM_SEARCH_API_KEY
                    and SEARCH_ENGINE_ID settings) is not set.
    """
    if not (api_key and cse_id):
        raise ValueError(
            "Les variables d'environnement GOOGLE_CUSTOM_SEARCH_API_KEY et SEARCH_ENGINE_ID doivent être définies pour utiliser la recherche Google."
        )
    try:
        service = build("customsearch", "v1", developerKey=api_key)
        res = service.cse().list(q=query, cx=cse_id).execute()
        # Keep only the three fields of interest; a missing field becomes None.
        return [
            {
                "title": item.get("title"),
                "snippet": item.get("snippet"),
                "link": item.get("link"),
            }
            for item in res.get("items", [])
        ]
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print(f"Erreur lors de la recherche Google: {e}")
        return None

Module mongo_auteur¶

Auteur ¶

__init__(nom) ¶

AuthorChecker ¶

__init__(episode) ¶

check_author(author, return_details=False, verbose=False) ¶

AuthorFuzzMatcher ¶

__init__(reference_authors=None) ¶

add_reference_author(author) ¶

find_best_match(name, min_score=80) ¶

google_search(query) ¶

`Auteur` ¶

`__init__(nom)` ¶

`AuthorChecker` ¶

`__init__(episode)` ¶

`check_author(author, return_details=False, verbose=False)` ¶

`AuthorFuzzMatcher` ¶

`__init__(reference_authors=None)` ¶

`add_reference_author(author)` ¶

`find_best_match(name, min_score=80)` ¶

`google_search(query)` ¶