I have the following code:
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler, PatternMatchingEventHandler
import duckdb
# Directory watched for newly landed CSV files.
path = "landing/persistent/"

# Module-level DuckDB connection shared by the event handler.
# NOTE: the original `global con` statement was removed — `global` is a
# no-op at module scope; `con` is already a module-level name.
con = duckdb.connect(database='formatted/my-db.duckdb', read_only=False)
class EventHandler(PatternMatchingEventHandler):
    """Watchdog handler that loads each newly created CSV into DuckDB.

    The table name is derived from the CSV filename (everything before the
    first dot); the SQL template supplied at construction time is formatted
    with that name and executed with the file path bound as a parameter.
    """

    def __init__(self, query):
        """Store the SQL template and register *.csv pattern matching.

        Args:
            query: SQL template containing one ``{}`` placeholder for the
                table name and one ``?`` placeholder for the CSV path.
        """
        PatternMatchingEventHandler.__init__(
            self,
            patterns=["*.csv"],
            ignore_directories=True,
            case_sensitive=False,
        )
        self.query = query

    def on_created(self, event):
        """Create/load a table named after the newly created CSV file."""
        filename = event.src_path.split("/")[-1]
        # Table name = filename up to the first dot (e.g. "users.csv" -> "users").
        filename_trunc = filename.split(".")[0]
        cursor = None
        try:
            cursor = con.cursor()
            # BUG FIX: use self.query (the template passed to __init__);
            # the original read the module-level `query` global, silently
            # ignoring the constructor argument.
            cursor.execute(self.query.format(filename_trunc),
                           [event.src_path])
            cursor.execute("show tables")
            print(cursor.fetchall())
        except Exception as e:
            # Best-effort: log and keep watching; one bad file must not
            # kill the observer thread.
            print(e)
        finally:
            # BUG FIX: guard against `cursor` being unbound when
            # con.cursor() itself raised (avoids UnboundLocalError).
            if cursor is not None:
                cursor.close()
# SQL template: {} is filled with the table name (derived from the CSV
# filename) and ? is bound to the CSV file path at execute time.
query = "CREATE TABLE {} AS SELECT * FROM read_csv_auto(?);"
event_handler = EventHandler(query)
observer = Observer()
# Watch `path` (and its subdirectories) for files matching the handler's
# "*.csv" pattern.
observer.schedule(event_handler, path, recursive=True)
observer.start()
try:
    # Keep the main thread alive; the observer runs in a background thread
    # and dispatches on_created callbacks as files appear.
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    # Ctrl-C: stop watching and wait for the observer thread to finish.
    observer.stop()
observer.join()
It basically waits for a file in the path and adds it to the db as a new table. However, I have two extra use cases:
- Add a new CSV containing some modifications (a table already exists for that filename), where some of its rows may already be present in the table.
- Add a new csv with a new column (with a table already created for that filename)
However, I don't know how to efficiently handle these two cases using SQL or DuckDB. Any help would be appreciated, thanks.
from Add columns to a table or records without duplicates in Duckdb
No comments:
Post a Comment