-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy path__init__.py
More file actions
442 lines (365 loc) · 17.3 KB
/
__init__.py
File metadata and controls
442 lines (365 loc) · 17.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
from typing import Literal, List, Tuple, Optional, Union
import logging
import os
import weakref
from threading import Lock
import tarfile
import tempfile
import io
# Module-level logger shared by the helpers below.
log = logging.getLogger("socketdev")

# Closed set of integration types the Socket API accepts.  IntegrationType
# narrows str parameters statically; INTEGRATION_TYPES is the runtime check.
IntegrationType = Literal["api", "github", "gitlab", "bitbucket", "azure"]
INTEGRATION_TYPES = ("api", "github", "gitlab", "bitbucket", "azure")


class FileDescriptorManager:
    """
    Global (singleton) manager that tracks open LazyFileLoader descriptors
    and enforces a cap on how many may be open simultaneously.

    When the cap is reached, the least recently opened file has its OS
    descriptor released; that loader transparently reopens at its saved
    position on the next read.
    """

    _instance = None
    _lock = Lock()  # guards singleton creation AND the open-file list (non-reentrant)

    def __new__(cls):
        # Double-checked locking so only the first caller pays for the lock.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every FileDescriptorManager() call; initialize the
        # singleton's state only once.
        if not self._initialized:
            self.max_open_files = 100  # default limit, can be overridden
            self.open_files = []  # weakrefs to LazyFileLoader instances, oldest first
            self._initialized = True
            log.debug(f"FileDescriptorManager initialized with default max_open_files={self.max_open_files}")

    def set_max_open_files(self, max_files: int):
        """Set the maximum number of open files, releasing the oldest
        descriptors if the current count exceeds the new limit."""
        victims = []
        with self._lock:
            self.max_open_files = max_files
            log.debug(f"FileDescriptorManager max_open_files set to {self.max_open_files}")
            # Prune dead weakrefs, then collect the oldest live loaders that
            # put us over the new limit.
            self.open_files = [ref for ref in self.open_files if ref() is not None]
            while len(self.open_files) >= self.max_open_files and self.open_files:
                loader = self.open_files.pop(0)()
                if loader is not None:
                    victims.append(loader)
        # Release descriptors OUTSIDE the lock: releasing a loader calls
        # unregister_file(), which re-acquires this non-reentrant lock and
        # would deadlock if done while still holding it.
        for loader in victims:
            if loader._file is not None:
                loader._release_fd()
                log.debug(f"Auto-closed file due to new descriptor limit: {loader.file_path}")

    def register_file_open(self, lazy_file_loader):
        """Record a newly opened loader, evicting the oldest open file if at the cap."""
        victim = None
        with self._lock:
            # Drop weakrefs whose loaders were garbage collected.
            self.open_files = [ref for ref in self.open_files if ref() is not None]
            if len(self.open_files) >= self.max_open_files:
                victim = self.open_files.pop(0)()
            self.open_files.append(weakref.ref(lazy_file_loader))
        # Same deadlock avoidance as set_max_open_files(): close outside the lock.
        if victim is not None and victim._file is not None:
            victim._release_fd()
            log.debug(f"Auto-closed file due to descriptor limit: {victim.file_path}")

    def unregister_file(self, lazy_file_loader):
        """Stop tracking a loader once its descriptor has been closed."""
        with self._lock:
            self.open_files = [
                ref for ref in self.open_files
                if ref() is not None and ref() is not lazy_file_loader
            ]


# Global singleton instance shared by all LazyFileLoader objects.
_fd_manager = FileDescriptorManager()


class LazyFileLoader:
    """
    A file-like object that only opens the underlying file when it is
    actually read, preventing "Too many open files" errors when handling
    large batches of files.

    Implements the standard file-like interface the requests library expects
    for multipart uploads, making it a drop-in replacement for a regular
    binary file object.  The OS descriptor may be released at any time
    (after a full read, or by FileDescriptorManager when the cap is hit);
    the file is transparently reopened at the saved position on the next
    read.  Only an explicit close() makes the object permanently unusable.
    """

    def __init__(self, file_path: str, name: str):
        self.file_path = file_path  # path on disk
        self.name = name  # filename reported to consumers (e.g. multipart field)
        self._file = None  # underlying file object; None while released
        self._closed = False  # True only after an explicit close()
        self._position = 0  # logical read position; survives descriptor release
        self._size = None  # cached file size in bytes

    def _ensure_open(self):
        """Open the underlying file (if released) and restore the position.

        Raises:
            ValueError: if the loader was explicitly closed.
        """
        if self._closed:
            raise ValueError("I/O operation on closed file.")
        if self._file is None:
            try:
                self._file = open(self.file_path, 'rb')
                _fd_manager.register_file_open(self)
                log.debug(f"Opened file for reading: {self.file_path}")
                # Resume where the previous descriptor left off.
                if self._position > 0:
                    self._file.seek(self._position)
            except OSError as e:
                if e.errno == 24:  # EMFILE: too many open files
                    # Collect garbage to close abandoned loaders, then retry once.
                    import gc
                    gc.collect()
                    self._file = open(self.file_path, 'rb')
                    _fd_manager.register_file_open(self)
                    log.debug(f"Opened file for reading (after gc): {self.file_path}")
                    if self._position > 0:
                        self._file.seek(self._position)
                else:
                    raise

    def _release_fd(self):
        """Close the OS descriptor but keep the loader logically open.

        The current position is saved so the next read/seek transparently
        reopens the file where it left off.  This is what read() and
        FileDescriptorManager use to free descriptors; calling close()
        instead would mark the loader closed and make it unusable.
        """
        if self._file is not None:
            self._position = self._file.tell()
            self._file.close()
            log.debug(f"Closed file: {self.file_path}")
            self._file = None
            _fd_manager.unregister_file(self)

    def _get_size(self):
        """Return (and cache) the file size without opening the file."""
        if self._size is None:
            self._size = os.path.getsize(self.file_path)
        return self._size

    def read(self, size: int = -1):
        """Read from the file, opening it on demand.

        Accepts size=None or a negative size (read to EOF) per the standard
        file protocol.  After reaching EOF the descriptor is released to
        free it immediately, but the loader stays usable and reopens if
        read or seeked again.
        """
        self._ensure_open()
        assert self._file is not None
        data = self._file.read(size)
        self._position = self._file.tell()
        # A read-to-EOF request or a short read means we hit end of file:
        # free the descriptor now instead of waiting for close()/GC.
        if size is None or size < 0 or len(data) < size:
            self._release_fd()
        return data

    def readline(self, size: int = -1):
        """Read a line from the file, opening it on demand."""
        self._ensure_open()
        assert self._file is not None
        data = self._file.readline(size)
        self._position = self._file.tell()
        return data

    def seek(self, offset: int, whence: int = 0):
        """Seek to a position; only opens the file when SEEK_END needs the size.

        Raises:
            ValueError: if the loader was explicitly closed.
        """
        if self._closed:
            raise ValueError("I/O operation on closed file.")
        if whence == 0:  # SEEK_SET: absolute position
            self._position = offset
        elif whence == 1:  # SEEK_CUR: relative to tracked position
            self._position += offset
        elif whence == 2:  # SEEK_END: must open the real file to know its end
            self._ensure_open()
            assert self._file is not None
            result = self._file.seek(offset, whence)
            self._position = self._file.tell()
            return result
        # If the descriptor happens to be open, keep its cursor in sync.
        if self._file is not None:
            return self._file.seek(self._position)
        return self._position

    def tell(self):
        """Return the current logical file position."""
        if self._closed:
            raise ValueError("I/O operation on closed file.")
        if self._file is not None:
            self._position = self._file.tell()
        return self._position

    def close(self):
        """Permanently close the loader; subsequent I/O raises ValueError."""
        self._release_fd()
        self._closed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __len__(self):
        """Return the file size. Requests uses this for Content-Length."""
        return self._get_size()

    @property
    def closed(self):
        """True once close() has been called explicitly."""
        return self._closed

    @property
    def mode(self):
        """Files are always opened binary read-only."""
        return 'rb'

    def readable(self):
        """Return whether the loader can still be read."""
        return not self._closed

    def writable(self):
        """Loaders are read-only."""
        return False

    def seekable(self):
        """Seeking is supported (position is tracked across reopens)."""
        return True
class Utils:
    @staticmethod
    def validate_integration_type(integration_type: str) -> IntegrationType:
        """Validate that integration_type is a supported integration.

        Raises:
            ValueError: if the value is not one of INTEGRATION_TYPES.
        """
        if integration_type not in INTEGRATION_TYPES:
            raise ValueError(f"Invalid integration type: {integration_type}")
        return integration_type  # type: ignore

    @staticmethod
    def _strip_prefix(key: str, base: str) -> Tuple[str, bool]:
        """Strip a directory prefix from key.

        Matches base either with a normalized trailing slash, or as a bare
        prefix followed by "/" (or equal to the whole key).

        Returns:
            (stripped_key, True) when base is a path prefix of key,
            otherwise (key, False).
        """
        with_slash = base if base.endswith("/") else base.rstrip("/") + "/"
        if key.startswith(with_slash):
            return key[len(with_slash):], True
        trimmed = base.rstrip("/")
        if key.startswith(trimmed) and (key.startswith(trimmed + "/") or key == trimmed):
            return key[len(trimmed):].lstrip("/"), True
        return key, False

    @staticmethod
    def load_files_for_sending_lazy(files: List[str], workspace: Optional[str] = None, max_open_files: int = 100, base_path: Optional[str] = None, base_paths: Optional[List[str]] = None) -> List[Tuple[str, Tuple[str, LazyFileLoader]]]:
        """
        Prepares files for sending to the Socket API using lazy loading.

        This version doesn't open all files immediately; it creates
        LazyFileLoader objects that only open files when they're actually
        read, preventing "Too many open files" errors with large numbers of
        manifest files.

        Args:
            files: List of file paths from find_files()
            workspace: Base directory path to make paths relative to
            max_open_files: Maximum number of files to keep open simultaneously (default: 100)
            base_path: Optional base path to strip from key names for cleaner file organization
            base_paths: Optional list of base paths to strip from key names (takes precedence over base_path)

        Returns:
            List of tuples formatted for requests multipart upload:
            [(field_name, (filename, lazy_file_object)), ...]
        """
        # Configure the file descriptor manager with the specified limit.
        _fd_manager.set_max_open_files(max_open_files)
        send_files = []
        # Normalize Windows separators everywhere up front.
        if workspace and "\\" in workspace:
            workspace = workspace.replace("\\", "/")
        if base_path and "\\" in base_path:
            base_path = base_path.replace("\\", "/")
        if base_paths:
            base_paths = [bp.replace("\\", "/") if "\\" in bp else bp for bp in base_paths]
        for file_path in files:
            if "\\" in file_path:
                file_path = file_path.replace("\\", "/")
            # Skip directories.
            if os.path.isdir(file_path):
                continue
            # The form-data key is the (possibly prefix-stripped) path.
            key = file_path
            path_stripped = False
            # base_paths takes precedence: strip the first one that matches.
            if base_paths:
                for bp in base_paths:
                    key, path_stripped = Utils._strip_prefix(key, bp)
                    if path_stripped:
                        break
            elif base_path:
                key, path_stripped = Utils._strip_prefix(key, base_path)
            # If no base path matched, fall back to workspace-relative keys.
            if not path_stripped and workspace and file_path.startswith(workspace):
                key = file_path[len(workspace):]
                while key.startswith("/"):
                    key = key[1:]
                path_stripped = True
            # Clean up relative path prefixes, but preserve filename dots.
            while key.startswith("./"):
                key = key[2:]
            while key.startswith("../"):
                key = key[3:]
            # Remove any remaining leading slashes (absolute paths).
            while key.startswith("/"):
                key = key[1:]
            # Remove a Windows drive letter if present (C:/...).
            if len(key) > 2 and key[1] == ':' and (key[2] == '/' or key[2] == '\\'):
                key = key[2:]
                while key.startswith("/"):
                    key = key[1:]
            # Lazy loader instead of opening the file immediately.
            send_files.append((key, (key, LazyFileLoader(file_path, key))))
        log.debug(f"Prepared {len(send_files)} files for lazy loading")
        return send_files

    @staticmethod
    def create_tar_gz_from_files(files: List[str], workspace: Optional[str] = None) -> io.BytesIO:
        """
        Create an in-memory tar.gz archive from a list of files.

        Missing paths and directories are skipped (logged) rather than
        raising.

        Args:
            files: List of file paths to include in the archive
            workspace: Base directory path to make archive names relative to

        Returns:
            io.BytesIO: In-memory tar.gz archive, positioned at offset 0
        """
        tar_buffer = io.BytesIO()
        # Normalize the workspace path.
        if workspace and "\\" in workspace:
            workspace = workspace.replace("\\", "/")
        if workspace:
            workspace = workspace.rstrip("/")
        added = 0
        with tarfile.open(fileobj=tar_buffer, mode='w:gz') as tar:
            for file_path in files:
                normalized_path = file_path.replace("\\", "/") if "\\" in file_path else file_path
                if not os.path.exists(normalized_path):
                    log.warning(f"File not found, skipping: {normalized_path}")
                    continue
                if os.path.isdir(normalized_path):
                    log.debug(f"Skipping directory: {normalized_path}")
                    continue
                # Archive member name: workspace-relative, cleaned of leading
                # "./", "../", slashes, and any Windows drive letter.
                arcname = normalized_path
                if workspace:
                    workspace_with_slash = workspace + "/"
                    if normalized_path.startswith(workspace_with_slash):
                        arcname = normalized_path[len(workspace_with_slash):]
                    elif normalized_path.startswith(workspace):
                        arcname = normalized_path[len(workspace):].lstrip("/")
                while arcname.startswith("./"):
                    arcname = arcname[2:]
                while arcname.startswith("../"):
                    arcname = arcname[3:]
                arcname = arcname.lstrip("/")
                if len(arcname) > 2 and arcname[1] == ':' and (arcname[2] == '/' or arcname[2] == '\\'):
                    arcname = arcname[2:].lstrip("/")
                log.debug(f"Adding to archive: {normalized_path} as {arcname}")
                tar.add(normalized_path, arcname=arcname)
                added += 1
        # Rewind so callers can read the archive from the start.
        tar_buffer.seek(0)
        # Count only entries actually added (skipped files are excluded).
        log.debug(f"Created tar.gz archive with {added} files")
        return tar_buffer

    @staticmethod
    def prepare_archive_files_for_upload(tar_files: Union[str, List[str]]) -> List[Tuple[str, Tuple[str, LazyFileLoader]]]:
        """
        Prepare archive files for upload to the API.

        Args:
            tar_files: Path or list of paths to archive files (.tar, .tar.gz, .tgz, .zip)

        Returns:
            List of tuples formatted for requests multipart upload, all
            under the "file" form field.
        """
        files_list = [tar_files] if isinstance(tar_files, str) else tar_files
        prepared_files = []
        for file_path in files_list:
            # Normalize Windows separators so rsplit("/") works uniformly.
            normalized_path = file_path.replace("\\", "/") if "\\" in file_path else file_path
            # Upload filename is the final path component.
            filename = normalized_path.rsplit("/", 1)[-1]
            prepared_files.append(("file", (filename, LazyFileLoader(normalized_path, filename))))
        log.debug(f"Prepared {len(prepared_files)} archive files for upload")
        return prepared_files