Skip to content

Module mongo_auteur

Auteur

Bases: BaseEntity

Source code in nbs/mongo_auteur.py
19
20
21
22
23
24
25
26
27
28
29
30
31
class Auteur(BaseEntity):
    """Author entity built on ``BaseEntity`` using the ``"auteurs"`` collection name."""

    # Collection name passed to ``BaseEntity.__init__`` for every instance.
    collection: str = "auteurs"

    def __init__(self, nom: str) -> None:
        """Initialise an Auteur instance.

        Args:
            nom (str): The author's name.
        """
        super().__init__(nom, self.collection)

    def some_method(self) -> None:
        """Call ``format_date`` on ``self.episode.date`` with the pattern ``"%Y/%m/%d"``.

        The formatted value is only bound to a local name; nothing is returned
        or stored.

        NOTE(review): ``self.episode`` is not assigned anywhere in this class —
        presumably it would have to come from ``BaseEntity`` or the caller;
        confirm before relying on this method.
        NOTE(review): ``format_date`` is not defined in the visible part of the
        file — verify it is imported.
        """
        fmt_date = format_date(self.episode.date, "%Y/%m/%d")

__init__(nom)

Initialise une instance d'Auteur.

Parameters:

Name Type Description Default
nom str

Le nom de l'auteur.

required
Source code in nbs/mongo_auteur.py
22
23
24
25
26
27
28
def __init__(self, nom: str) -> None:
    """Initialise an Auteur instance.

    Args:
        nom (str): The author's name.
    """
    # Delegate to the parent initialiser, passing the class-level collection name.
    super().__init__(nom, self.collection)

AuthorChecker

Class to verify and correct an author's name using multiple data sources.

This class verifies an author in an episode through sources including:
  • RSS metadata (title, description)
  • MongoDB database of known authors
  • LLM suggestions
  • Web search analysis
Source code in nbs/mongo_auteur.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
class AuthorChecker:
    """Class to verify and correct an author's name using multiple data sources.

    This class verifies an author in an episode through sources including:
      - RSS metadata (title, description)
      - MongoDB database of known authors
      - LLM suggestions
      - Web search analysis
    """

    def __init__(self, episode: Episode) -> None:
        """Initializes the AuthorChecker with an episode.

        The names cited in the episode title and description are extracted
        once here (one LLM call) and reused by every ``check_author`` call.

        Args:
            episode (Episode): An episode instance containing title and description.
        """
        self.episode = episode
        self.llm_structured_output = get_azure_llm("gpt-4o")
        self.authors_titre_description = self._get_authors_from_titre_description()

    def _get_filtered_titre_description(self, titre_or_description: str) -> str:
        """Filter the given titre or description to avoid Error 400.

        Filters out substrings that may trigger Azure OpenAI's content management policy (resulting in a 400 error).
        Specifically, for certain dates, predefined terms are replaced as specified in the filter mapping.

        For more details, see:
            https://github.com/castorfou/lmelp/issues/21

        Args:
            titre_or_description (str): 'titre' or 'description' depending on what to filter.
                Any value other than 'titre' selects the description.

        Returns:
            str: The filtered titre or description. A missing (``None``) field
                yields an empty string so the declared return type always holds.
        """
        # Episode date (formatted as YYYY/MM/DD) -> {term to hide: replacement}.
        filtering = {
            "2020/11/15": {"fossoyeur": "rigolo"},
        }
        fmt_date = self.episode.date.strftime("%Y/%m/%d")
        replacements = filtering.get(fmt_date)

        text = (
            self.episode.titre
            if titre_or_description == "titre"
            else self.episode.description
        ) or ""
        if replacements:
            for key, value in replacements.items():
                text = text.replace(key, value)
        return text

    def _get_authors_from_titre_description(self) -> List[str]:
        """Retrieves a list of author names extracted from the episode title and description using LLM.

        Any failure (filtering, LLM call, JSON decoding) is reported on stdout
        and results in an empty list rather than an exception.

        Returns:
            List[str]: A list of author names extracted from the title and description.
        """
        response_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "AuthorTitreDescriptionList",
                "schema": {
                    "type": "object",
                    "properties": {
                        "Authors_TitreDescription": {
                            "type": "array",
                            "items": {
                                "type": "string",
                                "description": "A list of names from title and description",
                            },
                        }
                    },
                    "required": ["Authors_TitreDescription"],
                    "additionalProperties": False,
                },
            },
        }
        # Bound before the try block: the error handler prints both values, and
        # the filtering calls themselves may be what raised.
        titre = description = ""
        try:
            titre = self._get_filtered_titre_description("titre")
            description = self._get_filtered_titre_description("description")
            response = self.llm_structured_output.chat(
                messages=[
                    ChatMessage(
                        role="system",
                        content="Tu es un assistant utile qui retourne une liste JSON de noms.",
                    ),
                    ChatMessage(
                        role="user",
                        content=f"Est-ce que tu peux me lister tous les noms qui sont cités dans le titre et la description de l'épisode suivant : {titre} {description}. ",
                    ),
                ],
                response_format=response_schema,
            )
        except Exception as e:
            print(f"Error getting authors from titre/description: {e}")
            print(f"prompt: {titre} {description}")
            return []
        try:
            json_dict = json.loads(response.message.content)
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers a response whose content is None.
            print("Error parsing JSON:", e)
            print("Raw response:", response.message.content)
            return []  # Return an empty list if parsing fails
        return json_dict.get("Authors_TitreDescription", [])

    def _get_authors_from_llm(self, autor: str) -> List[str]:
        """Queries the LLM to retrieve a list of potential author names based on a provided name.

        Args:
            autor (str): The author name to query.

        Returns:
            List[str]: A list of author names suggested by the LLM, or an empty
                list when the response cannot be decoded.
        """
        response_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "AuthorList",
                "schema": {
                    "type": "object",
                    "properties": {
                        "Authors_LLM": {
                            "type": "array",
                            "items": {
                                "type": "string",
                                "description": "A list of authors' names",
                            },
                        }
                    },
                    "required": ["Authors_LLM"],
                    "additionalProperties": False,
                },
            },
        }

        prompt = (
            """
        Tu es un agent expert en littérature.
        Donne moi quelques auteurs dont le nom s'approche de celui-ci : """
            + autor
            + """

        S'il s'agit deja d'un auteur connu, retourne moi juste son nom. S'il y a une erreur dans le nom que je t'ai donne, corrige moi en me donnant le nom de l'auteur que tu penses que j'ai voulu dire.

        Je veux que tu me donnes le prenom puis le nom dans cet ordre. Par exemple "Marcel Pagnol" ou "Victor Hugo".
        Ces auteurs sont susceptibles d'etre discutes dans "Le Masque et la Plume".

        Si tu me retournes plusieurs auteurs, fais le sous forme de liste par exemple si tu as identifie "auteur 1" et "auteur 2" alors retourne ["auteur 1", "auteur 2"]
        """
        )

        response = self.llm_structured_output.chat(
            messages=[
                ChatMessage(
                    role="system",
                    content="Tu es un agent litteraire qui connait parfaitement les auteurs.",
                ),
                ChatMessage(role="user", content=f"{prompt}. "),
            ],
            response_format=response_schema,
        )

        try:
            json_dict = json.loads(response.message.content)
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers a response whose content is None.
            print("Error parsing JSON:", e)
            print("Raw response:", response.message.content)
            return []
        return json_dict.get("Authors_LLM", [])

    def _get_author_from_web(self, author: str) -> Dict[str, Union[str, int]]:
        """Analyzes a Google search result to verify if a given name corresponds to an author.

        Args:
            author (str): The author name to verify.

        Returns:
            Dict[str, Union[str, int]]: A dictionary containing:
                - "auteur": The corrected author name if applicable.
                - "certitude": An integer between 0 and 100 indicating the confidence.
                - "analyse": A textual analysis of the Google search query.
            An empty dictionary is returned when the LLM response cannot be decoded.
        """
        result_google = google_search(author)

        prompt_incertitude_auteur = f"""
        Voici le resultat d'une requete google concernant un probable auteur inconnu de mon llm : {author}
        La requete est au format dict avec du json a l'interieur.
        Est-ce que tu peux analyser le contenu de cette requete et me dire si oui ou non {author} est un auteur de livres, 
        et accompagner ta reponse d'un pourcentage de certitude :
        * 100% de certitude signifie que tu es certain que {author} est un auteur de livres
        *  50% tu es ni sure ni pas sure que {author} est un auteur de livres
        *   0% tu es certain que {author} n'est pas un auteur de livres

        Voici le contenu de la requete google : {result_google}

        Tu repondras uniquement avec un dictionnaire qui va contenir 3 entrees :

        - "auteur" : le nom de l'auteur, eventuellement corrige si j'ai oublie des accents ou une faute de frappe
        - "certitude" : le pourcentage de certitude de 0 à 100, un entier
        - "analyse" : une analyse de la requete Google concernant l'auteur.
        """
        response_schema = {
            "type": "json_schema",
            "json_schema": {
                "name": "AuteurSchema",
                "schema": {
                    "type": "object",
                    "properties": {
                        "auteur": {
                            "type": "string",
                            "description": "Le nom de l'auteur éventuellement corrigé (accents, fautes de frappe).",
                        },
                        "certitude": {
                            "type": "integer",
                            "description": "Pourcentage de certitude (0 à 100).",
                            "minimum": 0,
                            "maximum": 100,
                        },
                        "analyse": {
                            "type": "string",
                            "description": "Analyse de la requête Google concernant l'auteur.",
                        },
                    },
                    "required": ["auteur", "certitude", "analyse"],
                    "additionalProperties": False,
                },
            },
        }

        response = self.llm_structured_output.chat(
            messages=[
                ChatMessage(
                    role="system",
                    content="Tu es un assistant utile qui analyse des requetes Google pour y deceler si un auteur de livre s'y cache.",
                ),
                ChatMessage(
                    role="user",
                    content=prompt_incertitude_auteur,
                ),
            ],
            response_format=response_schema,
        )
        try:
            json_dict = json.loads(response.message.content)
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers a response whose content is None.
            print("Error parsing JSON:", e)
            print("Raw response:", response.message.content)
            return {}
        return json_dict

    def _check_author_source(
        self, author: str, authors_list: List[str]
    ) -> Optional[str]:
        """Determines the best matching author from a provided list using fuzzy matching.

        Args:
            author (str): The author name to match.
            authors_list (List[str]): A list of author names to check against.

        Returns:
            Optional[str]: The best matching author name if the match score is above the threshold, otherwise None.
        """
        matcher = AuthorFuzzMatcher(authors_list)
        best_match, score = matcher.find_best_match(author)
        if score >= score_fuzz_threshold:
            return best_match
        else:
            return None

    def check_author(
        self, author: str, return_details: bool = False, verbose: bool = False
    ) -> Union[str, Dict[str, Union[str, int]], None]:
        """Verifies an author's name through various sources and returns the corrected name.

        It checks in the following order:
          1. RSS metadata (title, description)
          2. MongoDB list of known authors
          3. LLM suggested names
          4. Web search analysis

        The first source that yields a match wins; later sources are not queried.

        Args:
            author (str): The author name to verify.
            return_details (bool, optional): If True, returns a detailed dictionary with source and analysis. Defaults to False.
            verbose (bool, optional): If True, prints debug messages. Defaults to False.

        Returns:
            Union[str, Dict[str, Union[str, int]], None]: The corrected author name as a string if return_details is False;
                a detailed dict if return_details is True; or None if no match is found.
        """
        details = {"author_original": author, "author_corrected": None, "source": None}

        # 1. Check the RSS metadata (title, description)
        match = self._check_author_source(author, self.authors_titre_description)
        if match:
            details["author_corrected"] = match
            details["source"] = "rss:metadata"
            if verbose:
                print(f"Trouvé avec rss:metadata: {match}")
            return details if return_details else match

        # 2. Check the database (mongodb:auteurs)
        list_db_auteurs = [auteur.nom for auteur in Auteur.get_entries()]
        match = self._check_author_source(author, list_db_auteurs)
        if match:
            details["author_corrected"] = match
            details["source"] = "mongodb:auteurs"
            if verbose:
                print(f"Trouvé avec mongodb:auteurs: {match}")
            return details if return_details else match

        # 3. Check the names suggested by the LLM
        list_llm_auteurs = self._get_authors_from_llm(author)
        match = self._check_author_source(author, list_llm_auteurs)
        if match:
            details["author_corrected"] = match
            details["source"] = "llm"
            if verbose:
                print(f"Trouvé avec llm: {match}")
            return details if return_details else match

        # 4. Check through the web search analysis; the LLM-reported certainty
        #    is compared against the same threshold as the fuzzy scores.
        web_result_dict = self._get_author_from_web(author)
        match = web_result_dict.get("auteur")
        score = web_result_dict.get("certitude", 0)
        details.update(
            {
                "author_corrected": match,
                "score": score,
                "analyse": web_result_dict.get("analyse", ""),
                "source": "web search",
            }
        )
        if score >= score_fuzz_threshold:
            if verbose:
                print(f"Trouvé avec web search: {match}")
            return details if return_details else match
        else:
            if verbose:
                print(
                    f"Score insuffisant {score} avec web search: {web_result_dict.get('analyse', '')}"
                )
            details["author_corrected"] = None
            return details if return_details else None

__init__(episode)

Initializes the AuthorChecker with an episode.

Parameters:

Name Type Description Default
episode Episode

An episode instance containing title and description.

required
Source code in nbs/mongo_auteur.py
149
150
151
152
153
154
155
156
157
def __init__(self, episode: Episode) -> None:
    """Initializes the AuthorChecker with an episode.

    Args:
        episode (Episode): An episode instance containing title and description.
    """
    self.episode = episode
    # LLM client obtained from get_azure_llm for the "gpt-4o" model name.
    self.llm_structured_output = get_azure_llm("gpt-4o")
    # Must come last: the extraction reads both attributes set above.
    self.authors_titre_description = self._get_authors_from_titre_description()

check_author(author, return_details=False, verbose=False)

Verifies an author's name through various sources and returns the corrected name.

It checks in the following order:
  1. RSS metadata (title, description)
  2. MongoDB list of known authors
  3. LLM suggested names
  4. Web search analysis

Parameters:

Name Type Description Default
author str

The author name to verify.

required
return_details bool

If True, returns a detailed dictionary with source and analysis. Defaults to False.

False
verbose bool

If True, prints debug messages. Defaults to False.

False

Returns:

Type Description
Union[str, Dict[str, Union[str, int]], None]

Union[str, Dict[str, Union[str, int]], None]: The corrected author name as a string if return_details is False; a detailed dict if return_details is True; or None if no match is found.

Source code in nbs/mongo_auteur.py
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
def check_author(
    self, author: str, return_details: bool = False, verbose: bool = False
) -> Union[str, Dict[str, Union[str, int]], None]:
    """Run an author name through every verification source and return the corrected name.

    Sources are consulted in this order, stopping at the first match:
      1. RSS metadata (title, description)
      2. MongoDB list of known authors
      3. LLM suggested names
      4. Web search analysis

    Args:
        author (str): The author name to verify.
        return_details (bool, optional): When True, the detailed dictionary (original name,
            corrected name, source, and for the web search the score and analysis) is
            returned instead of the bare name. Defaults to False.
        verbose (bool, optional): When True, progress messages are printed. Defaults to False.

    Returns:
        Union[str, Dict[str, Union[str, int]], None]: The corrected name when return_details is
            False, the detailed dict when it is True, or None when nothing matched.
    """
    details = {"author_original": author, "author_corrected": None, "source": None}

    def outcome(name):
        # Same dict object in every case; only "author_corrected" tells hits from misses.
        return details if return_details else name

    # Candidate lists are wrapped in callables so the database and the LLM are
    # only queried when every earlier source has failed.
    fuzzy_sources = (
        ("rss:metadata", lambda: self.authors_titre_description),
        ("mongodb:auteurs", lambda: [entry.nom for entry in Auteur.get_entries()]),
        ("llm", lambda: self._get_authors_from_llm(author)),
    )
    for label, load_candidates in fuzzy_sources:
        found = self._check_author_source(author, load_candidates())
        if not found:
            continue
        details["author_corrected"] = found
        details["source"] = label
        if verbose:
            print(f"Trouvé avec {label}: {found}")
        return outcome(found)

    # Last resort: let the LLM judge the Google results for this name.
    web_answer = self._get_author_from_web(author)
    found = web_answer.get("auteur")
    certainty = web_answer.get("certitude", 0)
    details["author_corrected"] = found
    details["score"] = certainty
    details["analyse"] = web_answer.get("analyse", "")
    details["source"] = "web search"

    if certainty >= score_fuzz_threshold:
        if verbose:
            print(f"Trouvé avec web search: {found}")
        return outcome(found)

    if verbose:
        print(
            f"Score insuffisant {certainty} avec web search: {web_answer.get('analyse', '')}"
        )
    details["author_corrected"] = None
    return outcome(None)

AuthorFuzzMatcher

Source code in nbs/mongo_auteur.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class AuthorFuzzMatcher:
    """Fuzzy matcher that maps a raw name onto a set of known author names."""

    def __init__(self, reference_authors: Optional[List[str]] = None) -> None:
        """Initializes an AuthorFuzzMatcher with a list of known author names.

        Names go through ``add_reference_author`` so that the initial list and
        later additions are normalised identically (surrounding whitespace
        removed, blank entries dropped). Non-string entries are ignored.

        Args:
            reference_authors (Optional[List[str]]): A list of known author names. Defaults to None.
        """
        self.reference_authors = set()
        for author in reference_authors or ():
            if isinstance(author, str):
                self.add_reference_author(author)

    def add_reference_author(self, author: str) -> None:
        """Adds a new reference author to the set.

        Args:
            author (str): The author name to be added. Surrounding whitespace
                is stripped; a name that is empty after stripping is ignored.
        """
        cleaned = author.strip()
        if cleaned:
            self.reference_authors.add(cleaned)

    def find_best_match(
        self, name: str, min_score: int = 80
    ) -> Tuple[Optional[str], int]:
        """Finds the best matching reference author for a given name using token set ratio.

        Args:
            name (str): The name to match against the reference authors.
            min_score (int, optional): The minimal score required for a match. Defaults to 80.

        Returns:
            Tuple[Optional[str], int]: A tuple with the best matching author's name (or None if no match satisfies the minimum score) and the matching score.
                ``(None, 0)`` is returned when ``name`` is empty or there are no reference authors.
        """
        if not name or not self.reference_authors:
            return None, 0
        # Sorted so that equal scores resolve to the same name on every run;
        # raw set iteration order varies with string hash randomisation.
        best_match, score = process.extractOne(
            name, sorted(self.reference_authors), scorer=fuzz.token_set_ratio
        )
        if score >= min_score:
            return best_match, score
        return None, score

__init__(reference_authors=None)

Initializes an AuthorFuzzMatcher with a list of known author names.

Parameters:

Name Type Description Default
reference_authors Optional[List[str]]

A list of known author names. Defaults to None.

None
Source code in nbs/mongo_auteur.py
43
44
45
46
47
48
49
def __init__(self, reference_authors: Optional[List[str]] = None) -> None:
    """Set up the matcher's pool of known author names.

    Args:
        reference_authors (Optional[List[str]]): Known author names; duplicates collapse
            because the names are kept in a set. ``None`` or an empty list gives an
            empty pool. Defaults to None.
    """
    if reference_authors:
        self.reference_authors = set(reference_authors)
    else:
        self.reference_authors = set()

add_reference_author(author)

Adds a new reference author to the set.

Parameters:

Name Type Description Default
author str

The author name to be added.

required
Source code in nbs/mongo_auteur.py
51
52
53
54
55
56
57
def add_reference_author(self, author: str) -> None:
    """Register one more known author name.

    Args:
        author (str): The name to add; leading and trailing whitespace is removed first.
    """
    trimmed = author.strip()
    self.reference_authors.add(trimmed)

find_best_match(name, min_score=80)

Finds the best matching reference author for a given name using token set ratio.

Parameters:

Name Type Description Default
name str

The name to match against the reference authors.

required
min_score int

The minimal score required for a match. Defaults to 80.

80

Returns:

Type Description
Tuple[Optional[str], int]

Tuple[Optional[str], int]: A tuple with the best matching author's name (or None if no match satisfies the minimum score) and the matching score.

Source code in nbs/mongo_auteur.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def find_best_match(
    self, name: str, min_score: int = 80
) -> Tuple[Optional[str], int]:
    """Look up the reference author closest to ``name`` using the token set ratio scorer.

    Args:
        name (str): The name to compare with the reference authors.
        min_score (int, optional): Lowest score accepted as a match. Defaults to 80.

    Returns:
        Tuple[Optional[str], int]: The closest reference name and its score; the name is
            None when the score is below ``min_score``. ``(None, 0)`` when ``name`` is
            empty or no reference authors are registered.
    """
    if not (name and self.reference_authors):
        return None, 0

    candidate, candidate_score = process.extractOne(
        name, self.reference_authors, scorer=fuzz.token_set_ratio
    )
    accepted = candidate if candidate_score >= min_score else None
    return accepted, candidate_score

google_search(query)

Effectue une recherche Google en utilisant l'API Custom Search et retourne les résultats.

Parameters:

Name Type Description Default
query str

La requête de recherche.

required

Returns:

Type Description
Optional[List[Dict[str, Optional[str]]]]

Optional[List[Dict[str, Optional[str]]]]: Une liste de dictionnaires représentant les résultats de la recherche, chaque dictionnaire contenant les clés 'title', 'snippet' et 'link'. Retourne None en cas d'erreur.

Raises: ValueError: Si les variables d'environnement GOOGLE_CUSTOM_SEARCH_API_KEY ou SEARCH_ENGINE_ID ne sont pas définies.

Source code in nbs/mongo_auteur.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def google_search(query: str) -> Optional[List[Dict[str, Optional[str]]]]:
    """Run a Google search through the Custom Search API and return the results.

    Args:
        query (str): The search query.

    Returns:
        Optional[List[Dict[str, Optional[str]]]]:
            One dictionary per result item, each holding the keys 'title', 'snippet'
            and 'link'. None is returned when the search raises an error.
    Raises:
        ValueError: If ``api_key`` or ``cse_id`` is empty (per the error message, these
                    come from the GOOGLE_CUSTOM_SEARCH_API_KEY and SEARCH_ENGINE_ID
                    environment variables).
    """
    if not (api_key and cse_id):
        raise ValueError(
            "Les variables d'environnement GOOGLE_CUSTOM_SEARCH_API_KEY et SEARCH_ENGINE_ID doivent être définies pour utiliser la recherche Google."
        )

    try:
        search_service = build("customsearch", "v1", developerKey=api_key)
        payload = search_service.cse().list(q=query, cx=cse_id).execute()
        # Keep only the three fields of interest from each returned item.
        return [
            {
                "title": entry.get("title"),
                "snippet": entry.get("snippet"),
                "link": entry.get("link"),
            }
            for entry in payload.get("items", [])
        ]
    except Exception as e:
        # Best-effort lookup: report the failure and signal it with None.
        print(f"Erreur lors de la recherche Google: {e}")
        return None