Thi Notes
AboutNotesBlogTopicsToolsReading
About|Sketches |Cooking |Cafe icon Support Thi
Notes

Jupyter notebook snippets

Jupyter notebook snippets

Anh-Thi Dinh
draft
Python
Backend
Data Science
⚠️
This is a quick & dirty draft, for me only!
1%%capture
2%pip install pymongo pandas
Install a new package inside the notebook, without logs.
1%load_ext autoreload
2%autoreload 2
Auto reload the internal package.
1import os
2from dotenv import load_dotenv
3
4# .env is the same location as the notebook
5load_dotenv(override=True)
6
7# .env in the parent folder of the notebook
8notebook_dir = Path(os.popen('pwd').read().strip())
9env_path = notebook_dir.parent / '.env'
10load_dotenv(dotenv_path=env_path, override=True)
Load .env file.
1def get_files_from_json(file_path: str) -> list[str]:
2    with open(file_path, 'r', encoding='utf-8') as f:
3        data = json.load(f)
4    return data
1def save_files_to_json(data, file_path: str):
2    with open(file_path, 'w', encoding='utf-8') as f:
3        json.dump(data, f, ensure_ascii=False, indent=4)
1def get_files_from_folder(folder_path: str, extension: str = "pdf") -> list[str]:
2    return [f.name for f in Path(folder_path).glob(f"*.{extension}")]
1def query_database(query: str, database_url: str = None, json_columns: list[str] = None) -> list[dict]:
2    """
3    Execute a SQL query and return results as a list of dictionaries.
4    
5    Args:
6        query: SQL query string to execute
7        database_url: Database connection string. If None, uses CONNECTION_STRING_DB from environment
8        json_columns: List of column names whose content should be parsed as JSON objects
9        
10    Returns:
11        List of dictionaries, where each dictionary represents a row with column names as keys
12        
13    Example:
14        >>> rows = query_database(
15        ...     "SELECT * FROM document WHERE bot = '-Obbm0B_36eDCTtOhHcN' LIMIT 10",
16        ...     json_columns=['metadata', 'config']
17        ... )
18        >>> print(rows[0]['metadata']['filename'])  # metadata is now a dict, not a string
19    """
20    if database_url is None:
21        database_url = os.getenv('CONNECTION_STRING_DB')
22        if database_url is None:
23            raise ValueError("Database URL not provided and CONNECTION_STRING_DB not set in environment")
24    
25    engine = create_engine(database_url)
26    
27    with engine.connect() as conn:
28        result = conn.execute(text(query))
29        rows = [dict(row._mapping) for row in result]
30    
31    # Parse JSON columns if specified
32    if json_columns:
33        for row in rows:
34            for col in json_columns:
35                if col in row and row[col] is not None:
36                    try:
37                        # Parse JSON string to Python object
38                        if isinstance(row[col], str):
39                            row[col] = json.loads(row[col])
40                    except (json.JSONDecodeError, TypeError):
41                        # Keep original value if parsing fails
42                        pass
43    
44    return rows