[hotfix] fix task count (#56)

ver217 · web-flow · commit 6403388f7c89 · 2024-11-27T18:19:51.000+08:00
* [hotfix] fix task count

* [hotfix] fix h2d count
diff --git a/csrc/aio.cpp b/csrc/aio.cpp
@@ -1,6 +1,6 @@
 #include "aio.h"
 
-AIOAsyncIO::AIOAsyncIO(unsigned int n_entries)
+AIOAsyncIO::AIOAsyncIO(unsigned int n_entries, unsigned int n_tasks)
 {
     // printf("Initializing the io Context\n");
     this->max_nr = n_entries;
diff --git a/csrc/async_file_io.cpp b/csrc/async_file_io.cpp
@@ -1,6 +1,6 @@
 #include "async_file_io.h"
 
-AsyncFileWriter::AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend) : fd(fd), aio(create_asyncio(n_entries, backend)) {}
+AsyncFileWriter::AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend, unsigned int n_tasks) : fd(fd), aio(create_asyncio(n_entries, backend, n_tasks)) {}
 
 void AsyncFileWriter::write(size_t buffer, size_t n_bytes, unsigned long long offset, callback_t callback)
 {
diff --git a/csrc/backend.cpp b/csrc/backend.cpp
@@ -44,23 +44,23 @@ void probe_asyncio(const std::string &backend)
         if (backend == "uring")
         {
 #ifndef DISABLE_URING
-            aio.reset(new UringAsyncIO(2));
+            aio.reset(new UringAsyncIO(2, 0));
 #else
             throw std::runtime_error("backend uring is not installed\n");
 #endif
         }
         else if (backend == "aio")
         {
 #ifndef DISABLE_AIO
-            aio.reset(new AIOAsyncIO(2));
+            aio.reset(new AIOAsyncIO(2, 0));
 #else
             throw std::runtime_error("backend aio is not installed\n");
 #endif
         }
         else if (backend == "pthread")
         {
 #ifndef DISABLE_PTHREAD
-            aio.reset(new PthreadAsyncIO(2));
+            aio.reset(new PthreadAsyncIO(2, 0));
 #else
             throw std::runtime_error("backend pthread is not installed\n");
 #endif
@@ -160,7 +160,7 @@ std::string get_debug_log()
     return std::string(env_);
 }
 
-AsyncIO *create_asyncio(unsigned int n_entries, std::string backend)
+AsyncIO *create_asyncio(unsigned int n_entries, std::string backend, unsigned int n_tasks)
 {
     std::unordered_set<std::string> backends = get_backends();
     std::string default_backend = get_default_backend();
@@ -188,15 +188,15 @@ AsyncIO *create_asyncio(unsigned int n_entries, std::string backend)
 
 #ifndef DISABLE_URING
     if (backend == "uring")
-        return new UringAsyncIO(n_entries);
+        return new UringAsyncIO(n_entries, n_tasks);
 #endif
 #ifndef DISABLE_AIO
     if (backend == "aio")
-        return new AIOAsyncIO(n_entries);
+        return new AIOAsyncIO(n_entries, n_tasks);
 #endif
 #ifndef DISABLE_PTHREAD
     if (backend == "pthread")
-        return new PthreadAsyncIO(n_entries);
+        return new PthreadAsyncIO(n_entries, n_tasks);
 #endif
     throw std::runtime_error("Unsupported backend: " + backend);
 }
diff --git a/csrc/offload.cpp b/csrc/offload.cpp
@@ -28,7 +28,7 @@ iovec *tensors_to_iovec(const std::vector<at::Tensor> &tensors)
 
 Offloader::Offloader(const std::string &filename, unsigned int n_entries, const std::string &backend) : filename(filename), space_mgr(SpaceManager(0))
 {
-    this->aio = create_asyncio(n_entries, backend);
+    this->aio = create_asyncio(n_entries, backend, 0);
     this->fd = open(filename.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
     this->aio->register_file(fd);
 }
diff --git a/csrc/pthread_backend.cpp b/csrc/pthread_backend.cpp
@@ -16,8 +16,8 @@ void PthreadAsyncIO::write(int fd, void *buffer, size_t n_bytes, unsigned long l
             auto val = pwrite(fd, buffer, n_bytes, offset);
             if (this->is_debug)
             {
-                auto cur_tasks = this->tasks_in_progress.fetch_sub(1);
-                if (cur_tasks == 1)
+                auto cur_tasks = this->tasks_in_progress.fetch_add(1);
+                if (cur_tasks + 1 == this->total_tasks)
                 {
                     if (this->debug_log.empty())
                     {
@@ -117,23 +117,23 @@ void PthreadAsyncIO::register_file(int fd) {}
 
 void PthreadAsyncIO::register_h2d(unsigned int num_tensors)
 {
-    this->h2d_in_progress.store(num_tensors); // register tensors to write for this run
+    this->total_h2d = num_tensors;
 }
 
 void PthreadAsyncIO::sync_h2d()
 {
     std::unique_lock<std::mutex> lock(this->mtx);
     this->cv.wait(lock, [this]
-                  { return this->h2d_in_progress == 0; }); // block until all in-progress h2d are completed
+                  { return this->h2d_in_progress == this->total_h2d; }); // block until all in-progress h2d are completed
 }
 
 void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long offset, callback_t callback, std::optional<torch::Tensor> pinned)
 {
     auto stream = c10::cuda::getCurrentCUDAStream();
     if (!t.is_cuda())
     {
-        this->h2d_in_progress.fetch_sub(1); // already moved to cpu
-        if (this->h2d_in_progress.load() == 0)
+        auto cur_h2d = this->h2d_in_progress.fetch_add(1); // already moved to cpu
+        if (cur_h2d + 1 == this->total_h2d)
         { // notify when all h2d are completed and safe to optimizer.step()
             std::lock_guard<std::mutex> lock(this->mtx);
             cv.notify_one();
@@ -155,8 +155,8 @@ void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long of
                 {
                     cpu_tensor = t.to(t.options().device(c10::DeviceType::CPU), /*non_blocking*/ false, /*copy*/ false); // modified from torch::Tensor::cpu()
                 }
-                this->h2d_in_progress.fetch_sub(1);
-                if (this->h2d_in_progress.load() == 0)
+                auto cur_h2d = this->h2d_in_progress.fetch_add(1);
+                if (cur_h2d + 1 == this->total_h2d)
                 { // notify when all h2d are completed and safe to optimizer.step()
                     std::lock_guard<std::mutex> lock(this->mtx);
                     cv.notify_one();
@@ -171,8 +171,8 @@ void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long of
             auto val = pwrite(fd, buf, n_bytes, offset);
             if (this->is_debug)
             {
-                auto cur_tasks = this->tasks_in_progress.fetch_sub(1);
-                if (cur_tasks == 1)
+                auto cur_tasks = this->tasks_in_progress.fetch_add(1);
+                if (cur_tasks + 1 == this->total_tasks)
                 {
                     if (this->debug_log.empty())
                     {
diff --git a/csrc/py_api.cpp b/csrc/py_api.cpp
@@ -27,7 +27,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("get_backends", get_backends);
     m.def("probe_backend", probe_backend, py::arg("backend"));
     py::class_<AsyncFileWriter>(m, "AsyncFileWriter")
-        .def(py::init<int, unsigned int, const std::string &>(), py::arg("fd"), py::arg("n_entries"), py::arg("backend") = "aio")
+        .def(py::init<int, unsigned int, const std::string &, unsigned int>(), py::arg("fd"), py::arg("n_entries"), py::arg("backend") = "aio", py::arg("n_tasks") = 0)
         .def("write", &AsyncFileWriter::write, py::arg("buffer"), py::arg("n_bytes"), py::arg("offset"), py::arg("callback") = py::none())
         .def("write_tensor", &AsyncFileWriter::write_tensor, py::arg("tensor"), py::arg("offset"), py::arg("callback") = py::none(), py::arg("pinned") = py::none())
         .def("synchronize", &AsyncFileWriter::synchronize)
diff --git a/csrc/uring.cpp b/csrc/uring.cpp
@@ -2,7 +2,7 @@
 #include <memory>
 #include "uring.h"
 
-UringAsyncIO::UringAsyncIO(unsigned int n_entries) : n_write_events(0), n_read_events(0), n_entries(n_entries)
+UringAsyncIO::UringAsyncIO(unsigned int n_entries, unsigned int n_tasks) : n_write_events(0), n_read_events(0), n_entries(n_entries)
 {
     io_uring_queue_init(n_entries, &this->ring, 0);
 }
diff --git a/include/aio.h b/include/aio.h
@@ -19,7 +19,7 @@ class AIOAsyncIO : public AsyncIO
     void get_event(WaitType wt);
 
 public:
-    AIOAsyncIO(unsigned int n_entries);
+    AIOAsyncIO(unsigned int n_entries, unsigned int n_tasks);
     ~AIOAsyncIO();
 
     void write(int fd, void *buffer, size_t n_bytes, unsigned long long offset, callback_t callback);
diff --git a/include/async_file_io.h b/include/async_file_io.h
@@ -17,7 +17,7 @@
 class AsyncFileWriter
 {
 public:
-    AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend);
+    AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend, unsigned int n_tasks);
     void write(size_t buffer, size_t n_bytes, unsigned long long offset, callback_t callback);
     void write_tensor(torch::Tensor tensor, unsigned long long offset, callback_t callback, std::optional<torch::Tensor> pinned);
     void synchronize();
diff --git a/include/backend.h b/include/backend.h
@@ -14,6 +14,6 @@ std::string get_default_backend();
 
 bool get_debug_flag();
 
-AsyncIO *create_asyncio(unsigned int n_entries, std::string backend);
+AsyncIO *create_asyncio(unsigned int n_entries, std::string backend, unsigned int n_tasks);
 
 std::string get_debug_log();
diff --git a/include/pthread_backend.h b/include/pthread_backend.h
@@ -26,6 +26,7 @@ class PthreadAsyncIO : public AsyncIO
 private:
     BS::thread_pool pool;
     std::atomic<unsigned int> h2d_in_progress;
+    unsigned int total_h2d;
     std::condition_variable cv;
     std::mutex mtx;
     std::deque<std::tuple<std::future<ssize_t>, callback_t>> write_fut;
@@ -34,10 +35,11 @@ class PthreadAsyncIO : public AsyncIO
     const std::string debug_log = get_debug_log();
 
     std::atomic<unsigned int> tasks_in_progress;
+    unsigned int total_tasks;
 
 public:
-    PthreadAsyncIO(unsigned int n_entries)
-        : pool(n_entries), h2d_in_progress(0) {}
+    PthreadAsyncIO(unsigned int n_entries, unsigned int n_tasks)
+        : pool(n_entries), h2d_in_progress(0), tasks_in_progress(0), total_tasks(n_tasks), total_h2d(0) {}
 
     ~PthreadAsyncIO() {}
 
diff --git a/include/uring.h b/include/uring.h
@@ -13,7 +13,7 @@ class UringAsyncIO : public AsyncIO
     void get_event(WaitType wt);
 
 public:
-    UringAsyncIO(unsigned int n_entries);
+    UringAsyncIO(unsigned int n_entries, unsigned int n_tasks);
     ~UringAsyncIO();
 
     void write(int fd, void *buffer, size_t n_bytes, unsigned long long offset, callback_t callback);
diff --git a/tensornvme/_C/__init__.pyi b/tensornvme/_C/__init__.pyi
@@ -20,7 +20,7 @@ def get_backends() -> Set[str]: ...
 def probe_backend(backend: str) -> bool: ...
 
 class AsyncFileWriter:
-    def __init__(self, fd: int, n_entries: int, backend: str = "aio") -> None: ...
+    def __init__(self, fd: int, n_entries: int, backend: str = "aio", n_tasks: int = 0) -> None: ...
     def write(self, buffer: int, n_bytes: int, offset: int, callback: Optional[Callable[[], None]] = None) -> None: ...
     def write_tensor(
         self,
diff --git a/tensornvme/async_file_io.py b/tensornvme/async_file_io.py
@@ -10,14 +10,14 @@
 
 
 class AsyncFileWriter:
-    def __init__(self, path: str, n_entries: int = 16, backend=None) -> None:
+    def __init__(self, path: str, n_entries: int = 16, backend=None, n_tasks: int = 0) -> None:
         # this still takes ram buffer, which may lead to OOM
         # self.f = open(path, "wb", buffering=0)
         self.fd = os.open(path, os.O_WRONLY | os.O_CREAT, mode=0o664)
         if backend is not None:
-            self.io = AsyncFileWriterC(self.fd, n_entries, backend=backend)
+            self.io = AsyncFileWriterC(self.fd, n_entries, backend=backend, n_tasks=n_tasks)
         else:
-            self.io = AsyncFileWriterC(self.fd, n_entries)
+            self.io = AsyncFileWriterC(self.fd, n_entries, n_tasks=n_tasks)
         self.offset = 0
         # must ensure the data is not garbage collected
         self.buffers = []

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`#include "aio.h"`
`2`	`2`
`3`		`-AIOAsyncIO::AIOAsyncIO(unsigned int n_entries)`
	`3`	`+AIOAsyncIO::AIOAsyncIO(unsigned int n_entries, unsigned int n_tasks)`
`4`	`4`	`{`
`5`	`5`	`// printf("Initializing the io Context\n");`
`6`	`6`	`this->max_nr = n_entries;`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`#include "async_file_io.h"`
`2`	`2`
`3`		`-AsyncFileWriter::AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend) : fd(fd), aio(create_asyncio(n_entries, backend)) {}`
	`3`	`+AsyncFileWriter::AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend, unsigned int n_tasks) : fd(fd), aio(create_asyncio(n_entries, backend, n_tasks)) {}`
`4`	`4`
`5`	`5`	`void AsyncFileWriter::write(size_t buffer, size_t n_bytes, unsigned long long offset, callback_t callback)`
`6`	`6`	`{`
Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ iovec *tensors_to_iovec(const std::vector<at::Tensor> &tensors)`
`28`	`28`
`29`	`29`	`Offloader::Offloader(const std::string &filename, unsigned int n_entries, const std::string &backend) : filename(filename), space_mgr(SpaceManager(0))`
`30`	`30`	`{`
`31`		`- this->aio = create_asyncio(n_entries, backend);`
	`31`	`+ this->aio = create_asyncio(n_entries, backend, 0);`
`32`	`32`	`this->fd = open(filename.c_str(), O_RDWR \| O_CREAT, S_IRUSR \| S_IWUSR);`
`33`	`33`	`this->aio->register_file(fd);`
`34`	`34`	`}`
Original file line number	Diff line number	Diff line change
`@@ -16,8 +16,8 @@ void PthreadAsyncIO::write(int fd, void *buffer, size_t n_bytes, unsigned long l`
`16`	`16`	`auto val = pwrite(fd, buffer, n_bytes, offset);`
`17`	`17`	`if (this->is_debug)`
`18`	`18`	`{`
`19`		`- auto cur_tasks = this->tasks_in_progress.fetch_sub(1);`
`20`		`- if (cur_tasks == 1)`
	`19`	`+ auto cur_tasks = this->tasks_in_progress.fetch_add(1);`
	`20`	`+ if (cur_tasks + 1 == this->total_tasks)`
`21`	`21`	`{`
`22`	`22`	`if (this->debug_log.empty())`
`23`	`23`	`{`
`@@ -117,23 +117,23 @@ void PthreadAsyncIO::register_file(int fd) {}`
`117`	`117`
`118`	`118`	`void PthreadAsyncIO::register_h2d(unsigned int num_tensors)`
`119`	`119`	`{`
`120`		`- this->h2d_in_progress.store(num_tensors); // register tensors to write for this run`
	`120`	`+ this->total_h2d = num_tensors;`
`121`	`121`	`}`
`122`	`122`
`123`	`123`	`void PthreadAsyncIO::sync_h2d()`
`124`	`124`	`{`
`125`	`125`	`std::unique_lock<std::mutex> lock(this->mtx);`
`126`	`126`	`this->cv.wait(lock, [this]`
`127`		`- { return this->h2d_in_progress == 0; }); // block until all in-progress h2d are completed`
	`127`	`+ { return this->h2d_in_progress == this->total_h2d; }); // block until all in-progress h2d are completed`
`128`	`128`	`}`
`129`	`129`
`130`	`130`	`void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long offset, callback_t callback, std::optional<torch::Tensor> pinned)`
`131`	`131`	`{`
`132`	`132`	`auto stream = c10::cuda::getCurrentCUDAStream();`
`133`	`133`	`if (!t.is_cuda())`
`134`	`134`	`{`
`135`		`- this->h2d_in_progress.fetch_sub(1); // already moved to cpu`
`136`		`- if (this->h2d_in_progress.load() == 0)`
	`135`	`+ auto cur_h2d = this->h2d_in_progress.fetch_add(1); // already moved to cpu`
	`136`	`+ if (cur_h2d + 1 == this->total_h2d)`
`137`	`137`	`{ // notify when all h2d are completed and safe to optimizer.step()`
`138`	`138`	`std::lock_guard<std::mutex> lock(this->mtx);`
`139`	`139`	`cv.notify_one();`
`@@ -155,8 +155,8 @@ void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long of`
`155`	`155`	`{`
`156`	`156`	`cpu_tensor = t.to(t.options().device(c10::DeviceType::CPU), /non_blocking/ false, /copy/ false); // modified from torch::Tensor::cpu()`
`157`	`157`	`}`
`158`		`- this->h2d_in_progress.fetch_sub(1);`
`159`		`- if (this->h2d_in_progress.load() == 0)`
	`158`	`+ auto cur_h2d = this->h2d_in_progress.fetch_add(1);`
	`159`	`+ if (cur_h2d + 1 == this->total_h2d)`
`160`	`160`	`{ // notify when all h2d are completed and safe to optimizer.step()`
`161`	`161`	`std::lock_guard<std::mutex> lock(this->mtx);`
`162`	`162`	`cv.notify_one();`
`@@ -171,8 +171,8 @@ void PthreadAsyncIO::write_tensor(int fd, torch::Tensor t, unsigned long long of`
`171`	`171`	`auto val = pwrite(fd, buf, n_bytes, offset);`
`172`	`172`	`if (this->is_debug)`
`173`	`173`	`{`
`174`		`- auto cur_tasks = this->tasks_in_progress.fetch_sub(1);`
`175`		`- if (cur_tasks == 1)`
	`174`	`+ auto cur_tasks = this->tasks_in_progress.fetch_add(1);`
	`175`	`+ if (cur_tasks + 1 == this->total_tasks)`
`176`	`176`	`{`
`177`	`177`	`if (this->debug_log.empty())`
`178`	`178`	`{`
Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`#include <memory>`
`3`	`3`	`#include "uring.h"`
`4`	`4`
`5`		`-UringAsyncIO::UringAsyncIO(unsigned int n_entries) : n_write_events(0), n_read_events(0), n_entries(n_entries)`
	`5`	`+UringAsyncIO::UringAsyncIO(unsigned int n_entries, unsigned int n_tasks) : n_write_events(0), n_read_events(0), n_entries(n_entries)`
`6`	`6`	`{`
`7`	`7`	`io_uring_queue_init(n_entries, &this->ring, 0);`
`8`	`8`	`}`
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`class AsyncFileWriter`
`18`	`18`	`{`
`19`	`19`	`public:`
`20`		`- AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend);`
	`20`	`+ AsyncFileWriter(int fd, unsigned int n_entries, const std::string &backend, unsigned int n_tasks);`
`21`	`21`	`void write(size_t buffer, size_t n_bytes, unsigned long long offset, callback_t callback);`
`22`	`22`	`void write_tensor(torch::Tensor tensor, unsigned long long offset, callback_t callback, std::optional<torch::Tensor> pinned);`
`23`	`23`	`void synchronize();`